diff --git a/extensions/database/module/scripts/index/database-import-controller.js b/extensions/database/module/scripts/index/database-import-controller.js index 8ca32f0e52f2..ee4af9a58d75 100644 --- a/extensions/database/module/scripts/index/database-import-controller.js +++ b/extensions/database/module/scripts/index/database-import-controller.js @@ -309,7 +309,7 @@ Refine.DatabaseImportController.prototype._getPreviewData = function(callback, n result.rowModel = data; callback(result); }, - "jsonp" + "json" ); }, "json" diff --git a/extensions/wikibase/module/scripts/dialogs/perform-edits-dialog.js b/extensions/wikibase/module/scripts/dialogs/perform-edits-dialog.js index e515bfee6524..e4e07806059e 100644 --- a/extensions/wikibase/module/scripts/dialogs/perform-edits-dialog.js +++ b/extensions/wikibase/module/scripts/dialogs/perform-edits-dialog.js @@ -60,7 +60,7 @@ PerformEditsDialog.launch = function(logged_in_username, max_severity) { tag: WikibaseManager.getSelectedWikibaseTagTemplate(), maxEditsPerMinute: WikibaseManager.getSelectedWikibaseMaxEditsPerMinute() }, - { includeEngine: true, cellsChanged: true, columnStatsChanged: true }, + { includeEngine: true, cellsChanged: true, columnStatsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true }, { onDone: function() { dismiss(); } } ); }; diff --git a/extensions/wikibase/src/org/openrefine/wikibase/schema/WikibaseSchema.java b/extensions/wikibase/src/org/openrefine/wikibase/schema/WikibaseSchema.java index 0bd434de9c7e..4279445aff79 100644 --- a/extensions/wikibase/src/org/openrefine/wikibase/schema/WikibaseSchema.java +++ b/extensions/wikibase/src/org/openrefine/wikibase/schema/WikibaseSchema.java @@ -42,7 +42,6 @@ import org.openrefine.model.IndexedRow; import org.openrefine.model.Project; import org.openrefine.overlay.OverlayModel; -import org.openrefine.sorting.SortingConfig; import org.openrefine.util.ParsingUtilities; import org.openrefine.wikibase.qa.QAWarningStore; import 
org.openrefine.wikibase.schema.exceptions.QAWarningException; @@ -199,7 +198,7 @@ public List evaluate(GridState grid, Engine engine, QAWarningStore w throw new IllegalStateException("The schema has not been validated before being evaluated"); } List result = new ArrayList<>(); - for (IndexedRow indexedRow : grid.iterateRows(engine.combinedRowFilters(), SortingConfig.NO_SORTING)) { + for (IndexedRow indexedRow : grid.iterateRows(engine.combinedRowFilters())) { ExpressionContext ctxt = new ExpressionContext( siteIri, entityTypeSiteIri, diff --git a/main/src/main/java/org/openrefine/commands/expr/PreviewExpressionCommand.java b/main/src/main/java/org/openrefine/commands/expr/PreviewExpressionCommand.java index 0ffa6af4870d..4f531d419230 100644 --- a/main/src/main/java/org/openrefine/commands/expr/PreviewExpressionCommand.java +++ b/main/src/main/java/org/openrefine/commands/expr/PreviewExpressionCommand.java @@ -201,7 +201,12 @@ public void doPost(HttpServletRequest request, HttpServletResponse response) Properties bindings = ExpressionUtils.createBindings(); if (Mode.RowBased.equals(engine.getMode())) { - List rows = state.getRows(engine.combinedRowFilters(), sortingConfig, 0, limit); + GridState sorted = state; + if (!SortingConfig.NO_SORTING.equals(sortingConfig)) { + // TODO cache appropriately + sorted = state.reorderRows(sortingConfig, false); + } + List rows = sorted.getRowsAfter(engine.combinedRowFilters(), 0, limit); for (IndexedRow indexedRow : rows) { Cell cell = indexedRow.getRow().getCell(cellIndex); Record record = null; @@ -210,7 +215,12 @@ public void doPost(HttpServletRequest request, HttpServletResponse response) repeat, repeatCount)); } } else { - List records = state.getRecords(engine.combinedRecordFilters(), sortingConfig, 0, limit); + GridState sorted = state; + if (!SortingConfig.NO_SORTING.equals(sortingConfig)) { + // TODO cache appropriately + sorted = state.reorderRecords(sortingConfig, false); + } + List records = 
sorted.getRecordsAfter(engine.combinedRecordFilters(), 0, limit); for (Record record : records) { for (IndexedRow indexedRow : record.getIndexedRows()) { Cell cell = indexedRow.getRow().getCell(cellIndex); diff --git a/main/src/main/java/org/openrefine/commands/recon/GuessTypesOfColumnCommand.java b/main/src/main/java/org/openrefine/commands/recon/GuessTypesOfColumnCommand.java index 726e5adfd735..1935ee581d00 100644 --- a/main/src/main/java/org/openrefine/commands/recon/GuessTypesOfColumnCommand.java +++ b/main/src/main/java/org/openrefine/commands/recon/GuessTypesOfColumnCommand.java @@ -149,7 +149,7 @@ protected List guessTypes(GridState gridState, int cellIndex, String List samples = new ArrayList(sampleSize); Set sampleSet = new HashSet(); - for (IndexedRow row : gridState.getRows(0, sampleSize)) { + for (IndexedRow row : gridState.getRowsAfter(0, sampleSize)) { Object value = row.getRow().getCellValue(cellIndex); if (ExpressionUtils.isNonBlankData(value)) { String s = CharMatcher.whitespace().trimFrom(value.toString()); diff --git a/main/src/main/java/org/openrefine/commands/recon/PreviewExtendDataCommand.java b/main/src/main/java/org/openrefine/commands/recon/PreviewExtendDataCommand.java index 5a120d50b529..538d1baabde6 100644 --- a/main/src/main/java/org/openrefine/commands/recon/PreviewExtendDataCommand.java +++ b/main/src/main/java/org/openrefine/commands/recon/PreviewExtendDataCommand.java @@ -135,7 +135,11 @@ public void doPost(HttpServletRequest request, HttpServletResponse response) GridState state = project.getCurrentGridState(); Engine engine = new Engine(state, engineConfig); - List previewRows = state.getRows(engine.combinedRowFilters(), sortingConfig, 0, limit); + GridState sorted = state; + if (!SortingConfig.NO_SORTING.equals(sortingConfig)) { + sorted = sorted.reorderRows(sortingConfig, false); + } + List previewRows = sorted.getRowsAfter(engine.combinedRowFilters(), 0, limit); for (IndexedRow indexedRow : previewRows) { try { Row row = 
indexedRow.getRow(); diff --git a/main/src/main/java/org/openrefine/commands/row/GetRowsCommand.java b/main/src/main/java/org/openrefine/commands/row/GetRowsCommand.java index 510de3f570fb..82b3131dbe6b 100644 --- a/main/src/main/java/org/openrefine/commands/row/GetRowsCommand.java +++ b/main/src/main/java/org/openrefine/commands/row/GetRowsCommand.java @@ -34,7 +34,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT package org.openrefine.commands.row; import java.io.IOException; -import java.io.PrintWriter; import java.util.List; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -63,24 +62,44 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT import org.openrefine.model.Row; import org.openrefine.model.RowFilter; import org.openrefine.sorting.SortingConfig; -import org.openrefine.util.ParsingUtilities; +/** + * Retrieves rows from a project (or importing job). + * + * Those rows can be requested as either: - the batch of rows starting at a given index (included), up to a certain size + * - the batch of rows ending at a given index (excluded), again up to a given size. Filters (defined by facets) and the + * row/record mode toggle can also be provided. 
+ */ public class GetRowsCommand extends Command { protected static class WrappedRow { @JsonUnwrapped protected final Row row; + + /** + * The logical row index (meaning the one from the original, unsorted grid) + */ @JsonProperty("i") protected final long rowIndex; @JsonProperty("j") @JsonInclude(Include.NON_NULL) protected final Long recordIndex; - protected WrappedRow(Row rowOrRecord, long rowIndex, Long recordIndex) { + /** + * The row index used for pagination (which can be different from the logical one if a temporary sort is + * applied) + */ + @JsonProperty("k") + protected final long paginationIndex; + + // TODO add indices for pagination purposes + + protected WrappedRow(Row rowOrRecord, long rowIndex, Long recordIndex, long paginationIndex) { this.row = rowOrRecord; this.rowIndex = rowIndex; this.recordIndex = recordIndex; + this.paginationIndex = paginationIndex; } } @@ -112,21 +131,47 @@ protected static class JsonResult { */ @JsonProperty("total") protected final long totalCount; + /** + * Total number of rows in the unfiltered grid (needed to provide a link to the last page) + */ + @JsonProperty("totalRows") + protected final long totalRows; @JsonProperty("start") - protected final long start; + @JsonInclude(Include.NON_NULL) + protected final Long start; + @JsonProperty("end") + @JsonInclude(Include.NON_NULL) + protected final Long end; @JsonProperty("limit") protected final int limit; + /** + * The value to use as 'end' when fetching the page before this one. Can be null if there is no such page. + */ + @JsonProperty("previousPageId") + @JsonInclude(Include.NON_NULL) + protected final Long previousPageId; + /** + * The value to use as 'start' when fetching the page after this one. Can be null if there is no such page. 
+ */ + @JsonProperty("nextPageId") + @JsonInclude(Include.NON_NULL) + protected final Long nextPageId; + protected JsonResult(Mode mode, List rows, long filtered, - long processed, long totalCount, long start, int limit) { + long processed, long totalCount, long totalRows, long start, long end, int limit, Long previousPageId, Long nextPageId) { this.mode = mode; this.rows = rows; this.filtered = filtered; this.processed = processed; this.totalCount = totalCount; - this.start = start; + this.totalRows = totalRows; + this.start = start == -1 ? null : start; + this.end = end == -1 ? null : end; this.limit = Math.min(rows.size(), limit); + this.previousPageId = previousPageId; + this.nextPageId = nextPageId; } } @@ -148,7 +193,11 @@ public void doGet(HttpServletRequest request, HttpServletResponse response) thro protected static List recordToWrappedRows(Record record) { List rows = record.getRows(); return IntStream.range(0, rows.size()) - .mapToObj(i -> new WrappedRow(rows.get(i), record.getStartRowId() + i, i == 0 ? record.getStartRowId() : null)) + .mapToObj(i -> new WrappedRow( + rows.get(i), + record.getLogicalStartRowId() + i, + i == 0 ? record.getLogicalStartRowId() : null, + record.getStartRowId() + i)) .collect(Collectors.toList()); } @@ -172,19 +221,18 @@ protected void internalRespond(HttpServletRequest request, HttpServletResponse r } Engine engine = getEngine(request, project); - String callback = request.getParameter("callback"); GridState entireGrid = project.getCurrentGridState(); - long start = getLongParameter(request, "start", 0); + long start = getLongParameter(request, "start", -1L); + long end = getLongParameter(request, "end", -1L); int limit = Math.max(0, getIntegerParameter(request, "limit", 20)); response.setCharacterEncoding("UTF-8"); - response.setHeader("Content-Type", callback == null ? 
"application/json" : "text/javascript"); + response.setHeader("Content-Type", "application/json"); - PrintWriter writer = response.getWriter(); - if (callback != null) { - writer.write(callback); - writer.write("("); + if ((start == -1L) == (end == -1L)) { + respondException(response, new IllegalArgumentException("Exactly one of 'start' and 'end' should be provided.")); + return; } SortingConfig sortingConfig = SortingConfig.NO_SORTING; @@ -194,6 +242,8 @@ protected void internalRespond(HttpServletRequest request, HttpServletResponse r sortingConfig = SortingConfig.reconstruct(sortingJson); } } catch (IOException e) { + respondException(response, e); + return; } long filtered; @@ -205,15 +255,26 @@ protected void internalRespond(HttpServletRequest request, HttpServletResponse r if (engine.getMode() == Mode.RowBased) { totalCount = entireGrid.rowCount(); RowFilter combinedRowFilters = engine.combinedRowFilters(); - List rows = entireGrid.getRows(combinedRowFilters, sortingConfig, start, limit); + GridState sortedGrid = entireGrid; + if (!SortingConfig.NO_SORTING.equals(sortingConfig)) { + // TODO cache this appropriately + sortedGrid = entireGrid.reorderRows(sortingConfig, false); + } + List rows; + if (start != -1L) { + rows = sortedGrid.getRowsAfter(combinedRowFilters, start, limit); + } else { + rows = sortedGrid.getRowsBefore(combinedRowFilters, end, limit); + } wrappedRows = rows.stream() - .map(tuple -> new WrappedRow(tuple.getRow(), tuple.getIndex(), null)) + .map(tuple -> new WrappedRow(tuple.getRow(), tuple.getLogicalIndex(), null, tuple.getIndex())) .collect(Collectors.toList()); if (engineConfig.isNeutral()) { filtered = totalCount; processed = totalCount; } else { + // still using the unsorted grid for performance considerations if (engineConfig.getAggregationLimit() == null) { filtered = entireGrid.countMatchingRows(combinedRowFilters); processed = totalCount; @@ -227,7 +288,17 @@ protected void internalRespond(HttpServletRequest request, 
HttpServletResponse r } else { totalCount = entireGrid.recordCount(); RecordFilter combinedRecordFilters = engine.combinedRecordFilters(); - List records = entireGrid.getRecords(combinedRecordFilters, sortingConfig, start, limit); + GridState sortedGrid = entireGrid; + if (!SortingConfig.NO_SORTING.equals(sortingConfig)) { + // TODO cache this appropriately + sortedGrid = entireGrid.reorderRecords(sortingConfig, false); + } + List records; + if (start != -1L) { + records = sortedGrid.getRecordsAfter(combinedRecordFilters, start, limit); + } else { + records = sortedGrid.getRecordsBefore(combinedRecordFilters, end, limit); + } wrappedRows = records.stream() .flatMap(record -> recordToWrappedRows(record).stream()) @@ -237,6 +308,7 @@ protected void internalRespond(HttpServletRequest request, HttpServletResponse r filtered = totalCount; processed = totalCount; } else { + // still using the unsorted grid for performance considerations if (engineConfig.getAggregationLimit() == null) { filtered = entireGrid.countMatchingRecords(combinedRecordFilters); processed = totalCount; @@ -249,19 +321,33 @@ protected void internalRespond(HttpServletRequest request, HttpServletResponse r } } - JsonResult result = new JsonResult(engine.getMode(), - wrappedRows, filtered, processed, - totalCount, start, limit); - - ParsingUtilities.defaultWriter.writeValue(writer, result); - if (callback != null) { - writer.write(")"); + // Compute the indices of the previous and next pages + Long previousPageId = null; + Long nextPageId = null; + if (start != -1L) { + if (start > 0) { + previousPageId = start; + } + if (!wrappedRows.isEmpty()) { + nextPageId = wrappedRows.get(wrappedRows.size() - 1).paginationIndex + 1; + } + } else { + if (!wrappedRows.isEmpty() && wrappedRows.get(0).paginationIndex > 0) { + previousPageId = wrappedRows.get(0).paginationIndex; + } + nextPageId = end; } // metadata refresh for row mode and record mode if (project.getMetadata() != null) { 
project.getMetadata().setRowCount(totalCount); } + + JsonResult result = new JsonResult(engine.getMode(), + wrappedRows, filtered, processed, + totalCount, entireGrid.rowCount(), start, end, limit, previousPageId, nextPageId); + + respondJSON(response, result); } catch (Exception e) { respondException(response, e); } diff --git a/main/src/test/java/org/openrefine/commands/row/GetRowsCommandTest.java b/main/src/test/java/org/openrefine/commands/row/GetRowsCommandTest.java index 7cb499ef76b6..a579cff41deb 100644 --- a/main/src/test/java/org/openrefine/commands/row/GetRowsCommandTest.java +++ b/main/src/test/java/org/openrefine/commands/row/GetRowsCommandTest.java @@ -70,7 +70,7 @@ public void setUp() { } @Test - public void testJsonOutputRows() throws ServletException, IOException { + public void testJsonOutputRowsStart() throws ServletException, IOException { String rowJson = "{\n" + " \"filtered\" : 5,\n" + " \"limit\" : 2,\n" + @@ -83,6 +83,7 @@ public void testJsonOutputRows() throws ServletException, IOException { " } ],\n" + " \"flagged\" : false,\n" + " \"i\" : 0,\n" + + " \"k\" : 0,\n" + " \"starred\" : false\n" + " }, {\n" + " \"cells\" : [ null, {\n" + @@ -90,17 +91,129 @@ public void testJsonOutputRows() throws ServletException, IOException { " } ],\n" + " \"flagged\" : false,\n" + " \"i\" : 1,\n" + + " \"k\" : 1,\n" + " \"starred\" : false\n" + " } ],\n" + " \"start\" : 0,\n" + + " \"nextPageId\": 2,\n" + " \"total\" : 5,\n" + + " \"totalRows\" : 5,\n" + " \"processed\": 5\n" + " }"; when(request.getParameter("engine")).thenReturn("{\"mode\":\"row-based\",\"facets\":[]}"); + when(request.getParameter("start")).thenReturn("0"); when(request.getParameter("limit")).thenReturn("2"); command.doPost(request, response); - TestUtils.assertEqualAsJson(rowJson, writer.toString()); + TestUtils.assertEqualsAsJson(writer.toString(), rowJson); + } + + @Test + public void testJsonOutputRowsStartWithNoNextPage() throws ServletException, IOException { + String rowJson = 
"{\n" + + " \"filtered\" : 5,\n" + + " \"limit\" : 2,\n" + + " \"mode\" : \"row-based\",\n" + + " \"rows\" : [ {\n" + + " \"cells\" : [ {\n" + + " \"v\" : \"a\"\n" + + " }, {\n" + + " \"v\" : \"b\"\n" + + " } ],\n" + + " \"flagged\" : false,\n" + + " \"i\" : 0,\n" + + " \"k\" : 0,\n" + + " \"starred\" : false\n" + + " }, {\n" + + " \"cells\" : [ null, {\n" + + " \"v\" : \"c\"\n" + + " } ],\n" + + " \"flagged\" : false,\n" + + " \"i\" : 1,\n" + + " \"k\" : 1,\n" + + " \"starred\" : false\n" + + " } ],\n" + + " \"start\" : 0,\n" + + " \"nextPageId\": 2,\n" + + " \"total\" : 5,\n" + + " \"totalRows\" : 5,\n" + + " \"processed\": 5\n" + + " }"; + + when(request.getParameter("engine")).thenReturn("{\"mode\":\"row-based\",\"facets\":[]}"); + when(request.getParameter("start")).thenReturn("0"); + when(request.getParameter("limit")).thenReturn("2"); + command.doPost(request, response); + TestUtils.assertEqualsAsJson(writer.toString(), rowJson); + } + + @Test + public void testJsonOutputRowsEnd() throws ServletException, IOException { + String rowJson = "{\n" + + " \"filtered\" : 5,\n" + + " \"limit\" : 1,\n" + + " \"mode\" : \"row-based\",\n" + + " \"rows\" : [ {\n" + + " \"cells\" : [ null, {\n" + + " \"v\" : \"c\"\n" + + " } ],\n" + + " \"flagged\" : false,\n" + + " \"i\" : 1,\n" + + " \"k\" : 1,\n" + + " \"starred\" : false\n" + + " } ],\n" + + " \"end\" : 2,\n" + + " \"previousPageId\": 1,\n" + + " \"nextPageId\": 2,\n" + + " \"total\" : 5,\n" + + " \"totalRows\" : 5,\n" + + " \"processed\": 5\n" + + " }"; + + when(request.getParameter("engine")).thenReturn("{\"mode\":\"row-based\",\"facets\":[]}"); + when(request.getParameter("end")).thenReturn("2"); + when(request.getParameter("limit")).thenReturn("1"); + command.doPost(request, response); + TestUtils.assertEqualsAsJson(writer.toString(), rowJson); + } + + @Test + public void testJsonOutputRowsEndWithNoPreviousPage() throws ServletException, IOException { + String rowJson = "{\n" + + " \"filtered\" : 5,\n" + + " 
\"limit\" : 2,\n" + + " \"mode\" : \"row-based\",\n" + + " \"rows\" : [ {" + + " \"cells\": [ {\n" + + " \"v\" : \"a\"\n" + + " }, {\n" + + " \"v\" : \"b\"\n" + + " } ],\n" + + " \"flagged\" : false,\n" + + " \"i\" : 0,\n" + + " \"k\" : 0,\n" + + " \"starred\" : false\n" + + " }, {\n" + + " \"cells\" : [ null, {\n" + + " \"v\" : \"c\"\n" + + " } ],\n" + + " \"flagged\" : false,\n" + + " \"i\" : 1,\n" + + " \"k\" : 1,\n" + + " \"starred\" : false\n" + + " } ],\n" + + " \"end\" : 2,\n" + + " \"nextPageId\": 2,\n" + + " \"total\" : 5,\n" + + " \"totalRows\" : 5,\n" + + " \"processed\": 5\n" + + " }"; + + when(request.getParameter("engine")).thenReturn("{\"mode\":\"row-based\",\"facets\":[]}"); + when(request.getParameter("end")).thenReturn("2"); + when(request.getParameter("limit")).thenReturn("3"); + command.doPost(request, response); + TestUtils.assertEqualsAsJson(writer.toString(), rowJson); } @Test @@ -117,14 +230,18 @@ public void testAggregationLimitRowsNoFacet() throws ServletException, IOExcepti " } ],\n" + " \"flagged\" : false,\n" + " \"i\" : 0,\n" + + " \"k\" : 0,\n" + " \"starred\" : false\n" + " } ],\n" + " \"start\" : 0,\n" + + " \"nextPageId\": 1,\n" + " \"total\" : 5,\n" + + " \"totalRows\" : 5,\n" + " \"processed\": 5\n" + " }"; when(request.getParameter("engine")).thenReturn("{\"mode\":\"row-based\",\"facets\":[],\"aggregationLimit\":2}"); + when(request.getParameter("start")).thenReturn("0"); when(request.getParameter("limit")).thenReturn("1"); command.doPost(request, response); TestUtils.assertEqualsAsJson(writer.toString(), rowJson); @@ -151,14 +268,18 @@ public void testAggregationLimitRowsFacet() throws ServletException, IOException " } ],\n" + " \"flagged\" : false,\n" + " \"i\" : 0,\n" + + " \"k\" : 0,\n" + " \"starred\" : false\n" + " } ],\n" + " \"start\" : 0,\n" + + " \"nextPageId\": 1,\n" + " \"total\" : 5,\n" + + " \"totalRows\" : 5,\n" + " \"processed\": 2\n" + " }"; when(request.getParameter("engine")).thenReturn(engineConfig); + 
when(request.getParameter("start")).thenReturn("0"); when(request.getParameter("limit")).thenReturn("1"); command.doPost(request, response); @@ -180,6 +301,7 @@ public void testJsonOutputRecords() throws ServletException, IOException { " \"flagged\" : false,\n" + " \"i\" : 0,\n" + " \"j\" : 0,\n" + + " \"k\" : 0,\n" + " \"starred\" : false\n" + " }, {\n" + " \"cells\" : [ null, {\n" + @@ -187,14 +309,18 @@ public void testJsonOutputRecords() throws ServletException, IOException { " } ],\n" + " \"flagged\" : false,\n" + " \"i\" : 1,\n" + + " \"k\" : 1,\n" + " \"starred\" : false\n" + " } ],\n" + " \"start\" : 0,\n" + + " \"nextPageId\": 2,\n" + " \"total\" : 3,\n" + + " \"totalRows\" : 5,\n" + " \"processed\": 3\n" + " }"; when(request.getParameter("engine")).thenReturn("{\"mode\":\"record-based\",\"facets\":[]}"); + when(request.getParameter("start")).thenReturn("0"); when(request.getParameter("limit")).thenReturn("1"); command.doPost(request, response); TestUtils.assertEqualAsJson(recordJson, writer.toString()); @@ -215,6 +341,7 @@ public void testAggregationLimitRecordsNoFacet() throws ServletException, IOExce " \"flagged\" : false,\n" + " \"i\" : 0,\n" + " \"j\" : 0,\n" + + " \"k\" : 0,\n" + " \"starred\" : false\n" + " }, {\n" + " \"cells\" : [ null, {\n" + @@ -222,14 +349,18 @@ public void testAggregationLimitRecordsNoFacet() throws ServletException, IOExce " } ],\n" + " \"flagged\" : false,\n" + " \"i\" : 1,\n" + + " \"k\" : 1,\n" + " \"starred\" : false\n" + " } ],\n" + " \"start\" : 0,\n" + + " \"nextPageId\": 2,\n" + " \"total\" : 3,\n" + + " \"totalRows\" : 5,\n" + " \"processed\": 3\n" + " }"; when(request.getParameter("engine")).thenReturn("{\"mode\":\"record-based\",\"facets\":[],\"aggregationLimit\":2}"); + when(request.getParameter("start")).thenReturn("0"); when(request.getParameter("limit")).thenReturn("1"); command.doPost(request, response); TestUtils.assertEqualsAsJson(writer.toString(), recordJson); @@ -257,6 +388,7 @@ public void 
testAggregationLimitRecordsFacet() throws ServletException, IOExcept " \"flagged\" : false,\n" + " \"i\" : 0,\n" + " \"j\" : 0,\n" + + " \"k\" : 0," + " \"starred\" : false\n" + " }, {\n" + " \"cells\" : [ null, {\n" + @@ -264,14 +396,18 @@ public void testAggregationLimitRecordsFacet() throws ServletException, IOExcept " } ],\n" + " \"flagged\" : false,\n" + " \"i\" : 1,\n" + + " \"k\" : 1,\n" + " \"starred\" : false\n" + " } ],\n" + " \"start\" : 0,\n" + + " \"nextPageId\": 2,\n" + " \"total\" : 3,\n" + + " \"totalRows\" : 5,\n" + " \"processed\": 2\n" + " }"; when(request.getParameter("engine")).thenReturn(engineConfig); + when(request.getParameter("start")).thenReturn("0"); when(request.getParameter("limit")).thenReturn("1"); command.doPost(request, response); TestUtils.assertEqualsAsJson(writer.toString(), recordJson); diff --git a/main/tests/cypress/cypress/integration/project/grid/viewpanel-header/pagination.spec.js b/main/tests/cypress/cypress/integration/project/grid/viewpanel-header/pagination.spec.js index b5f25d21eec4..cc035b959767 100644 --- a/main/tests/cypress/cypress/integration/project/grid/viewpanel-header/pagination.spec.js +++ b/main/tests/cypress/cypress/integration/project/grid/viewpanel-header/pagination.spec.js @@ -27,7 +27,7 @@ describe(__filename, function () { it('Test the "next" button', function () { cy.loadAndVisitProject('food.small'); cy.get('.viewpanel-paging').find('a').contains('next').click(); - cy.get('#viewpanel-paging-current-input').should('have.value', 2); + cy.get('#viewpanel-paging-current-min-row').should('have.value', 11); cy.assertCellEquals(0, 'Shrt_Desc', 'CHEESE,COLBY'); cy.assertCellEquals(9, 'Shrt_Desc', 'CHEESE,FONTINA'); }); @@ -37,11 +37,11 @@ describe(__filename, function () { // First go next cy.get('.viewpanel-paging').find('a').contains('next').click(); - cy.get('#viewpanel-paging-current-input').should('have.value', 2); + cy.get('#viewpanel-paging-current-min-row').should('have.value', 11); // Then test the 
previous button cy.get('.viewpanel-paging').find('a').contains('previous').click(); - cy.get('#viewpanel-paging-current-input').should('have.value', 1); + cy.get('#viewpanel-paging-current-min-row').should('have.value', 1); cy.assertCellEquals(0, 'Shrt_Desc', 'BUTTER,WITH SALT'); cy.assertCellEquals(9, 'Shrt_Desc', 'CHEESE,CHESHIRE'); }); @@ -50,9 +50,9 @@ describe(__filename, function () { cy.loadAndVisitProject('food.small'); cy.get('.viewpanel-paging').find('a').contains('last').click(); - cy.get('#viewpanel-paging-current-input').should('have.value', 20); - cy.assertCellEquals(0, 'Shrt_Desc', 'SPICES,BASIL,DRIED'); - cy.assertCellEquals(8, 'Shrt_Desc', 'CLOVES,GROUND'); + cy.get('#viewpanel-paging-current-min-row').should('have.value', 190); + cy.assertCellEquals(0, 'Shrt_Desc', 'ANISE SEED'); + cy.assertCellEquals(9, 'Shrt_Desc', 'CLOVES,GROUND'); }); it('Test the "first" button', function () { @@ -60,11 +60,11 @@ describe(__filename, function () { // First go next cy.get('.viewpanel-paging').find('a').contains('next').click(); - cy.get('#viewpanel-paging-current-input').should('have.value', 2); + cy.get('#viewpanel-paging-current-min-row').should('have.value', 11); - // Then test the previous button + // Then test the First button cy.get('.viewpanel-paging').find('a').contains('first').click(); - cy.get('#viewpanel-paging-current-input').should('have.value', 1); + cy.get('#viewpanel-paging-current-min-row').should('have.value', 1); cy.assertCellEquals(0, 'Shrt_Desc', 'BUTTER,WITH SALT'); cy.assertCellEquals(9, 'Shrt_Desc', 'CHEESE,CHESHIRE'); }); @@ -72,8 +72,8 @@ describe(__filename, function () { it('Test entering an arbitrary page number', function () { cy.loadAndVisitProject('food.small'); - cy.get('#viewpanel-paging-current-input').type('{backspace}2{enter}'); - cy.get('#viewpanel-paging-current-input').should('have.value', 2); + cy.get('#viewpanel-paging-current-min-row').type('{backspace}11{enter}'); + 
cy.get('#viewpanel-paging-current-min-row').should('have.value', 11); cy.assertCellEquals(0, 'Shrt_Desc', 'CHEESE,COLBY'); cy.assertCellEquals(9, 'Shrt_Desc', 'CHEESE,FONTINA'); }); diff --git a/main/webapp/modules/core/langs/translation-en.json b/main/webapp/modules/core/langs/translation-en.json index 9fc846844eee..dc557eafdb89 100644 --- a/main/webapp/modules/core/langs/translation-en.json +++ b/main/webapp/modules/core/langs/translation-en.json @@ -633,7 +633,6 @@ "core-views/move-to-left": "Move column left", "core-views/move-to-right": "Move column right", "core-views/show-as": "Show as", - "core-views/goto-page": "$1 of $2 {{plural:$2|page|pages}}", "core-views/first": "first", "core-views/previous": "previous", "core-views/next": "next", diff --git a/main/webapp/modules/core/langs/translation-es.json b/main/webapp/modules/core/langs/translation-es.json index cb48f63b85a2..5dae203cd104 100644 --- a/main/webapp/modules/core/langs/translation-es.json +++ b/main/webapp/modules/core/langs/translation-es.json @@ -522,7 +522,6 @@ "core-views/to-text/header": "A texto", "core-views/to-text/single": "A texto", "core-views/reg-exp": "expresión regular", - "core-views/goto-page": "$1 de $2 {{plural:$2|página|paginas}}", "core-views/word-facet": "Faceta por palabra", "core-views/collapse-left": "Contraer todas las columnas a la izquierda", "core-views/clear-recon": "Quitar la información de cotejo", diff --git a/main/webapp/modules/core/langs/translation-fr.json b/main/webapp/modules/core/langs/translation-fr.json index fe4587801ae2..c2645c301b7f 100644 --- a/main/webapp/modules/core/langs/translation-fr.json +++ b/main/webapp/modules/core/langs/translation-fr.json @@ -536,7 +536,6 @@ "core-views/to-text": "En texte…", "core-views/to-text/header": "En texte", "core-views/to-text/single": "En texte", - "core-views/goto-page": "$1 de $2 {{plural:$2|page|pages}}", "core-views/first": "première", "core-views/word-facet": "Facette par mot", "core-views/check-format": "Merci 
de vérifier le format du fichier.", diff --git a/main/webapp/modules/core/scripts/dialogs/clustering-dialog.js b/main/webapp/modules/core/scripts/dialogs/clustering-dialog.js index 2e0bbf84623f..7f1be03b65e2 100644 --- a/main/webapp/modules/core/scripts/dialogs/clustering-dialog.js +++ b/main/webapp/modules/core/scripts/dialogs/clustering-dialog.js @@ -424,7 +424,7 @@ ClusteringDialog.prototype._apply = function(onDone) { expression: this._expression, edits: JSON.stringify(edits) }, - { cellsChanged: true }, + { cellsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true }, { onError: function(o) { alert("Error: " + o.message); diff --git a/main/webapp/modules/core/scripts/dialogs/column-reordering-dialog.js b/main/webapp/modules/core/scripts/dialogs/column-reordering-dialog.js index a0a03de59e4d..83310818b20f 100644 --- a/main/webapp/modules/core/scripts/dialogs/column-reordering-dialog.js +++ b/main/webapp/modules/core/scripts/dialogs/column-reordering-dialog.js @@ -92,7 +92,7 @@ ColumnReorderingDialog.prototype._commit = function() { "reorder-columns", null, { "columnNames" : JSON.stringify(columnNames) }, - { modelsChanged: true }, + { modelsChanged: true, rowIdsPreserved: true }, // TODO could add recordIdsPreserved: true if the record key column did not change { includeEngine: false } ); diff --git a/main/webapp/modules/core/scripts/dialogs/common-transform-dialog.js b/main/webapp/modules/core/scripts/dialogs/common-transform-dialog.js index 27d88ae4261d..1632078c7893 100644 --- a/main/webapp/modules/core/scripts/dialogs/common-transform-dialog.js +++ b/main/webapp/modules/core/scripts/dialogs/common-transform-dialog.js @@ -117,7 +117,7 @@ commonTransformDialog.prototype._commit = function(expression) { repeatCount: repeatCount }, null, - { cellsChanged: true } + { cellsChanged: true, rowIdsPreserved: true } ); }; diff --git a/main/webapp/modules/core/scripts/dialogs/expression-column-dialog.js 
b/main/webapp/modules/core/scripts/dialogs/expression-column-dialog.js index d6cd053c50c8..ca2334e71233 100644 --- a/main/webapp/modules/core/scripts/dialogs/expression-column-dialog.js +++ b/main/webapp/modules/core/scripts/dialogs/expression-column-dialog.js @@ -10,7 +10,7 @@ var doTextTransform = function(columnName, expression, onError, repeat, repeatCo repeatCount: repeatCount }, null, - { cellsChanged: true } + { cellsChanged: true, rowIdsPreserved: true } ); }; diff --git a/main/webapp/modules/core/scripts/index/default-importing-controller/controller.js b/main/webapp/modules/core/scripts/index/default-importing-controller/controller.js index 25e62bdb84ae..bcb411d4d06c 100644 --- a/main/webapp/modules/core/scripts/index/default-importing-controller/controller.js +++ b/main/webapp/modules/core/scripts/index/default-importing-controller/controller.js @@ -274,7 +274,7 @@ Refine.DefaultImportingController.prototype.getPreviewData = function(callback, result.rowModel = data; callback(result); }, - "jsonp" + "json" ).fail(() => { alert($.i18n('core-index/rows-loading-failed')); }); }, "json" diff --git a/main/webapp/modules/core/scripts/project.js b/main/webapp/modules/core/scripts/project.js index 909ab84d3c5f..fadd90ae0499 100644 --- a/main/webapp/modules/core/scripts/project.js +++ b/main/webapp/modules/core/scripts/project.js @@ -315,8 +315,9 @@ Refine.createUpdateFunction = function(options, onFinallyDone) { pushFunction(Refine.reinitializeProjectData); } if (options.everythingChanged || options.modelsChanged || options.rowsChanged || options.rowMetadataChanged || options.cellsChanged || options.engineChanged) { + var preservePage = options.rowIdsPreserved && (options.recordIdsPreserved || ui.browsingEngine.getMode() === "row-based"); pushFunction(function(onDone) { - ui.dataTableView.update(onDone); + ui.dataTableView.update(onDone, preservePage); }); pushFunction(function(onDone) { ui.browsingEngine.update(onDone); @@ -502,7 +503,11 @@ 
Refine.columnNameToColumnIndex = function(columnName) { return -1; }; -Refine.fetchRows = function(start, limit, onDone, sorting) { +/* + Fetch rows after or before a given row id. The engine configuration can also + be used to set filters (facets) or switch between rows/records mode. +*/ +Refine.fetchRows = function(paginationOptions, limit, onDone, sorting) { var body = { engine: JSON.stringify(ui.browsingEngine.getJSON()) }; @@ -511,7 +516,7 @@ Refine.fetchRows = function(start, limit, onDone, sorting) { } $.post( - "command/core/get-rows?" + $.param({ project: theProject.id, start: start, limit: limit }), + "command/core/get-rows?" + $.param({ ...paginationOptions, project: theProject.id, limit: limit }), body, function(data) { if(data.code === "error") { diff --git a/main/webapp/modules/core/scripts/reconciliation/standard-service-panel.js b/main/webapp/modules/core/scripts/reconciliation/standard-service-panel.js index 003d1efb123f..df719bfab980 100644 --- a/main/webapp/modules/core/scripts/reconciliation/standard-service-panel.js +++ b/main/webapp/modules/core/scripts/reconciliation/standard-service-panel.js @@ -337,7 +337,7 @@ ReconStandardServicePanel.prototype.start = function() { limit: parseInt(this._elmts.maxCandidates[0].value) || 0 }) }, - { cellsChanged: true, columnStatsChanged: true } + { cellsChanged: true, columnStatsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true } ); }; diff --git a/main/webapp/modules/core/scripts/views/data-table/cell-ui.js b/main/webapp/modules/core/scripts/views/data-table/cell-ui.js index 2bc43a77348c..8541aa53e435 100644 --- a/main/webapp/modules/core/scripts/views/data-table/cell-ui.js +++ b/main/webapp/modules/core/scripts/views/data-table/cell-ui.js @@ -493,7 +493,7 @@ DataTableCellUI.prototype._postProcessSeveralCells = function(command, params, b command, params, bodyParams, - { cellsChanged: true, columnStatsChanged: columnStatsChanged } + { cellsChanged: true, columnStatsChanged: columnStatsChanged, 
rowIdsPreserved: true, recordIdsPreserved: true } ); }; diff --git a/main/webapp/modules/core/scripts/views/data-table/data-table-view.js b/main/webapp/modules/core/scripts/views/data-table/data-table-view.js index 1bc65869805f..e5a53635b1c2 100644 --- a/main/webapp/modules/core/scripts/views/data-table/data-table-view.js +++ b/main/webapp/modules/core/scripts/views/data-table/data-table-view.js @@ -44,10 +44,7 @@ function DataTableView(div) { this._columnHeaderUIs = []; this._shownulls = false; - this._currentPageNumber = 1; - this._showRows(0); - - this._refocusPageInput = false; + this._showRows({start: 0}); } DataTableView._extenders = []; @@ -84,9 +81,16 @@ DataTableView.prototype.resize = function() { tableContainer.height((tableContainerIntendedHeight - tableContainerVPadding) + "px"); }; -DataTableView.prototype.update = function(onDone) { - this._currentPageNumber = 1; - this._showRows(0, onDone); +DataTableView.prototype.update = function(onDone, preservePage) { + var paginationOptions = { start: 0 }; + if (preservePage) { + if (theProject.rowModel.start !== undefined) { + paginationOptions.start = theProject.rowModel.start; + } else { + paginationOptions.end = theProject.rowModel.end; + } + } + this._showRows(paginationOptions, onDone); }; DataTableView.prototype.render = function() { @@ -114,7 +118,6 @@ DataTableView.prototype.render = function() { '' ); var elmts = DOM.bind(html); - this._div.empty().append(html); ui.summaryBar.updateResultCount(); @@ -142,6 +145,7 @@ DataTableView.prototype.render = function() { } this._renderDataTables(elmts.table[0], elmts.tableHeader[0]); + this._div.empty().append(html); // show/hide null values in cells $(".data-table-null").toggle(self._shownulls); @@ -167,14 +171,18 @@ DataTableView.prototype._renderSortingControls = function(sortingControls) { DataTableView.prototype._renderPagingControls = function(pageSizeControls, pagingControls) { var self = this; - self._lastPageNumber = 
Math.floor((theProject.rowModel.filtered - 1) / this._pageSize) + 1; - - var from = (theProject.rowModel.start + 1); - var to = Math.min(theProject.rowModel.filtered, theProject.rowModel.start + theProject.rowModel.limit); + var rowIds = theProject.rowModel.rows.map(row => row.k); + if (theProject.rowModel.start !== undefined) { + rowIds.push(theProject.rowModel.start); + } else { + rowIds.push(theProject.rowModel.end); + } + var minRowId = Math.min(... rowIds); + var maxRowId = Math.max(... rowIds); var firstPage = $('« '+$.i18n('core-views/first')+'').appendTo(pagingControls); var previousPage = $('‹ '+$.i18n('core-views/previous')+'').appendTo(pagingControls); - if (theProject.rowModel.start > 0) { + if (theProject.rowModel.previousPageId !== undefined) { firstPage.addClass("action").on('click',function(evt) { self._onClickFirstPage(this, evt); }); previousPage.addClass("action").on('click',function(evt) { self._onClickPreviousPage(this, evt); }); } else { @@ -182,33 +190,20 @@ DataTableView.prototype._renderPagingControls = function(pageSizeControls, pagin previousPage.addClass("inaction"); } - var pageControlsSpan = $('').attr("id", "viewpanel-paging-current"); - - var pageInputSize = 20 + (8 * ui.dataTableView._lastPageNumber.toString().length); - var currentPageInput = $('') - .on('change',function(evt) { self._onChangeGotoPage(this, evt); }) - .on('keydown',function(evt) { self._onKeyDownGotoPage(this, evt); }) - .attr("id", "viewpanel-paging-current-input") + var minRowInputSize = 20 + (8* theProject.rowModel.total.toString().length); + var minRowInput = $('') + .attr("id", "viewpanel-paging-current-min-row") .attr("min", 1) - .attr("max", self._lastPageNumber) - .attr("required", "required") - .val(self._currentPageNumber) - .css("width", pageInputSize +"px"); - - pageControlsSpan.append($.i18n('core-views/goto-page', '', self._lastPageNumber)); - pageControlsSpan.appendTo(pagingControls); + .attr("max", theProject.rowModel.total) + .css("width", 
minRowInputSize) + .val(minRowId + 1) + .on('change', function(evt) { self._onChangeMinRow(this, evt); }) + .appendTo(pagingControls); + $('').addClass("viewpanel-pagingcount").html(" - " + (maxRowId + 1) + " ").appendTo(pagingControls); - $('span#currentPageInput').replaceWith($(currentPageInput)); - - if(self._refocusPageInput == true) { - self._refocusPageInput = false; - var currentPageInputForFocus = $('input#viewpanel-paging-current-input'); - currentPageInputForFocus.ready(function(evt) { setTimeout(() => { currentPageInputForFocus.focus(); }, 250); }); - } - var nextPage = $(''+$.i18n('core-views/next')+' ›').appendTo(pagingControls); var lastPage = $(''+$.i18n('core-views/last')+' »').appendTo(pagingControls); - if (theProject.rowModel.start + theProject.rowModel.limit < theProject.rowModel.filtered) { + if (theProject.rowModel.nextPageId) { nextPage.addClass("action").on('click',function(evt) { self._onClickNextPage(this, evt); }); lastPage.addClass("action").on('click',function(evt) { self._onClickLastPage(this, evt); }); } else { @@ -228,7 +223,7 @@ DataTableView.prototype._renderPagingControls = function(pageSizeControls, pagin } else { a.text(pageSize).addClass("action").on('click',function(evt) { self._pageSize = pageSize; - self.update(); + self.update(undefined, true); }); } }; @@ -431,9 +426,9 @@ DataTableView.prototype._renderDataTables = function(table, tableHeader) { } }; -DataTableView.prototype._showRows = function(start, onDone) { +DataTableView.prototype._showRows = function(paginationOptions, onDone) { var self = this; - Refine.fetchRows(start, this._pageSize, function() { + Refine.fetchRows(paginationOptions, this._pageSize, function() { self.render(); if (onDone) { @@ -442,60 +437,33 @@ DataTableView.prototype._showRows = function(start, onDone) { }, this._sorting); }; -DataTableView.prototype._onChangeGotoPage = function(elmt, evt) { - var gotoPageNumber = parseInt($('input#viewpanel-paging-current-input').val()); - - if(typeof 
gotoPageNumber != "number" || isNaN(gotoPageNumber) || gotoPageNumber == "") { - $('input#viewpanel-paging-current-input').val(this._currentPageNumber); - return; - } - - if(gotoPageNumber > this._lastPageNumber) gotoPageNumber = this._lastPageNumber; - if(gotoPageNumber < 1) gotoPageNumber = 1; - - this._currentPageNumber = gotoPageNumber; - this._showRows((gotoPageNumber - 1) * this._pageSize); -}; - -DataTableView.prototype._onKeyDownGotoPage = function(elmt, evt) { - var keyDownCode = event.which; - - if([38, 40].indexOf(keyDownCode) == -1) return; - if(self._refocusPageInput == true) return; - - evt.preventDefault(); - this._refocusPageInput = true; - - var newPageValue = $('input#viewpanel-paging-current-input')[0].value; - if(keyDownCode == 38) { // Up arrow - if(newPageValue <= 1) return; - this._onClickPreviousPage(elmt, evt); - } - - if(keyDownCode == 40) { // Down arrow - if(newPageValue >= this._lastPageNumber) return; - this._onClickNextPage(elmt, evt); - } -}; - DataTableView.prototype._onClickPreviousPage = function(elmt, evt) { - this._currentPageNumber--; - this._showRows(theProject.rowModel.start - this._pageSize); + this._showRows({end: theProject.rowModel.previousPageId}); }; DataTableView.prototype._onClickNextPage = function(elmt, evt) { - this._currentPageNumber++; - this._showRows(theProject.rowModel.start + this._pageSize); + this._showRows({start: theProject.rowModel.nextPageId}); }; DataTableView.prototype._onClickFirstPage = function(elmt, evt) { - this._currentPageNumber = 1; - this._showRows(0); + this._showRows({start: 0}); }; DataTableView.prototype._onClickLastPage = function(elmt, evt) { - this._currentPageNumber = this._lastPageNumber; - this._showRows((this._lastPageNumber - 1) * this._pageSize); + this._showRows({end: theProject.rowModel.totalRows}); +}; + +DataTableView.prototype._onChangeMinRow = function(elmt, evt) { + var input = $('#viewpanel-paging-current-min-row'); + var newMinRow = input.val(); + if (newMinRow <= 0) { + 
newMinRow = 1; + input.val(newMinRow); + } else if (newMinRow > theProject.rowModel.total) { + newMinRow = theProject.rowModel.total; + input.val(theProject.rowModel.total); + } + this._showRows({start: newMinRow - 1}); }; DataTableView.prototype._getSortingCriteriaCount = function() { @@ -808,14 +776,14 @@ DataTableView.prototype._createMenuForAllColumns = function(elmt) { label: $.i18n('core-views/star-rows'), id: "core/star-rows", click: function() { - Refine.postCoreProcess("annotate-rows", { "starred" : "true" }, null, { rowMetadataChanged: true }); + Refine.postCoreProcess("annotate-rows", { "starred" : "true" }, null, { rowMetadataChanged: true, rowIdsPreserved: true, recordIdsPreserved: true }); } }, { label: $.i18n('core-views/unstar-rows'), id: "core/unstar-rows", click: function() { - Refine.postCoreProcess("annotate-rows", { "starred" : "false" }, null, { rowMetadataChanged: true }); + Refine.postCoreProcess("annotate-rows", { "starred" : "false" }, null, { rowMetadataChanged: true, rowIdsPreserved: true, recordIdsPreserved: true }); } }, {}, @@ -823,14 +791,14 @@ DataTableView.prototype._createMenuForAllColumns = function(elmt) { label: $.i18n('core-views/flag-rows'), id: "core/flag-rows", click: function() { - Refine.postCoreProcess("annotate-rows", { "flagged" : "true" }, null, { rowMetadataChanged: true }); + Refine.postCoreProcess("annotate-rows", { "flagged" : "true" }, null, { rowMetadataChanged: true, rowIdsPreserved: true, recordIdsPreserved: true }); } }, { label: $.i18n('core-views/unflag-rows'), id: "core/unflag-rows", click: function() { - Refine.postCoreProcess("annotate-rows", { "flagged" : "false" }, null, { rowMetadataChanged: true }); + Refine.postCoreProcess("annotate-rows", { "flagged" : "false" }, null, { rowMetadataChanged: true, rowIdsPreserved: true, recordIdsPreserved: true }); } }, {}, diff --git a/main/webapp/modules/core/scripts/views/data-table/menu-edit-cells.js 
b/main/webapp/modules/core/scripts/views/data-table/menu-edit-cells.js index 4071c98b11b9..d8d7b23f12a5 100644 --- a/main/webapp/modules/core/scripts/views/data-table/menu-edit-cells.js +++ b/main/webapp/modules/core/scripts/views/data-table/menu-edit-cells.js @@ -102,7 +102,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { columnName: column.name }, null, - { modelsChanged: true } + { modelsChanged: true, rowIdsPreserved: true } ); }; @@ -113,7 +113,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { columnName: column.name }, null, - { modelsChanged: true } + { modelsChanged: true, rowIdsPreserved: true } ); }; diff --git a/main/webapp/modules/core/scripts/views/data-table/menu-edit-column.js b/main/webapp/modules/core/scripts/views/data-table/menu-edit-column.js index b9e216c6d163..b79e70086919 100644 --- a/main/webapp/modules/core/scripts/views/data-table/menu-edit-column.js +++ b/main/webapp/modules/core/scripts/views/data-table/menu-edit-column.js @@ -44,7 +44,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { repeatCount: repeatCount }, { expression: expression }, - { cellsChanged: true }, + { cellsChanged: true, rowIdsPreserved: true }, callbacks ); }; @@ -96,7 +96,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { onError: $('input[name="create-column-dialog-onerror-choice"]:checked')[0].value }, { expression: previewWidget.getExpression(true) }, - { modelsChanged: true }, + { modelsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true }, { onDone: function(o) { dismiss(); @@ -178,7 +178,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { httpHeaders: JSON.stringify(elmts.setHttpHeadersContainer.find("input").serializeArray()) }, null, - { modelsChanged: true } + { modelsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true } ); dismiss(); }); @@ -220,7 +220,7 @@ 
DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { columnName: column.name }, null, - { modelsChanged: true } + { modelsChanged: true, rowIdsPreserved: true } ); }; @@ -254,7 +254,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { newColumnName: newColumnName }, null, - {modelsChanged: true}, + {modelsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true}, { onDone: function () { dismiss(); @@ -275,7 +275,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { index: index }, null, - { modelsChanged: true } + { modelsChanged: true, rowIdsPreserved: true } ); }; @@ -289,7 +289,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { index: newidx }, null, - { modelsChanged: true } + { modelsChanged: true, rowIdsPreserved: true } ); } }; @@ -426,7 +426,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { "reorder-columns", null, { "columnNames" : JSON.stringify(columnsToKeep) }, - { modelsChanged: true }, + { modelsChanged: true, rowIdsPreserved: true }, { includeEngine: false } ); } @@ -479,7 +479,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { onError: onError }, { expression: expression }, - { modelsChanged: true }, + { modelsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true }, { onFinallyDone: deleteColumns} ); } diff --git a/main/webapp/modules/core/scripts/views/data-table/menu-reconcile.js b/main/webapp/modules/core/scripts/views/data-table/menu-reconcile.js index 5ac4d452c30f..114e3805b324 100644 --- a/main/webapp/modules/core/scripts/views/data-table/menu-reconcile.js +++ b/main/webapp/modules/core/scripts/views/data-table/menu-reconcile.js @@ -42,7 +42,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { "recon-discard-judgments", { columnName: column.name, clearData: false }, null, - { cellsChanged: true, columnStatsChanged: true } + { 
cellsChanged: true, columnStatsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true } ); }; @@ -51,7 +51,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { "recon-discard-judgments", { columnName: column.name, clearData: true }, null, - { cellsChanged: true, columnStatsChanged: true } + { cellsChanged: true, columnStatsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true } ); }; @@ -60,7 +60,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { "recon-match-best-candidates", { columnName: column.name }, null, - { cellsChanged: true, columnStatsChanged: true } + { cellsChanged: true, columnStatsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true } ); }; @@ -78,7 +78,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { schemaSpace: schemaSpace }, null, - { cellsChanged: true, columnStatsChanged: true } + { cellsChanged: true, columnStatsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true } ); }; @@ -143,7 +143,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { schemaSpace: service.schemaSpace }, null, - { cellsChanged: true, columnStatsChanged: true } + { cellsChanged: true, columnStatsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true } ); DialogSystem.dismissUntil(level - 1); @@ -231,7 +231,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { schemaSpace: schemaSpace }, null, - { cellsChanged: true, columnStatsChanged: true } + { cellsChanged: true, columnStatsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true } ); }; @@ -270,7 +270,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { onError: "set-to-blank" }, { expression: "cell.recon.match.id" }, - { modelsChanged: true }, + { modelsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true }, { onDone: function(o) { dismiss(); @@ -336,7 +336,7 @@ 
DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { "recon-copy-across-columns", null, config, - { rowsChanged: true } + { rowsChanged: true, rowIdsPreserved: true, recordIdsPreserved: true } ); dismiss(); } diff --git a/main/webapp/modules/core/styles/views/data-table-view.less b/main/webapp/modules/core/styles/views/data-table-view.less index 819e097553eb..cecf7f9765e7 100644 --- a/main/webapp/modules/core/styles/views/data-table-view.less +++ b/main/webapp/modules/core/styles/views/data-table-view.less @@ -55,45 +55,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. font-weight: bold; } -.viewpanel-rowrecord a { - background-color: @fill_primary; - margin: 0px 1px 0px 1px; - padding: 2px 5px 2px 5px; - .rounded_corners(4px); - } - -.viewpanel-pagesize a { - background-color: @fill_primary; - margin: 0px 2px 0px 2px; - padding: 2px 3px 2px 3px; - .rounded_corners(4px); - } - -div.viewpanel-pagesize span:first-of-type { - margin: 0px 0px 0px 20px; -} - -.viewpanel-paging a { - background-color: @fill_primary; - margin: 0px 4px 0px 4px; - padding: 2px 10px 2px 10px; - .rounded_corners(4px); - } - -.viewpanel-paging span#viewpanel-paging-current { - margin: 0 10px 0 10px; - padding: 0px; - } - -.viewpanel-paging input#viewpanel-paging-current-input { - width: 60px; - margin: 0px; - padding: 0px; - height: 16px; - position: relative; - top: -2px; - } - .data-table-header { width: 1px; position: relative; diff --git a/refine-local-runner/src/main/java/org/openrefine/model/LocalGridState.java b/refine-local-runner/src/main/java/org/openrefine/model/LocalGridState.java index f478e3b58abb..c72407545b32 100644 --- a/refine-local-runner/src/main/java/org/openrefine/model/LocalGridState.java +++ b/refine-local-runner/src/main/java/org/openrefine/model/LocalGridState.java @@ -17,6 +17,7 @@ import com.fasterxml.jackson.annotation.JsonIgnoreType; import com.fasterxml.jackson.core.JsonProcessingException; +import 
org.apache.commons.collections.IteratorUtils; import org.openrefine.browsing.facets.RecordAggregator; import org.openrefine.browsing.facets.RowAggregator; @@ -49,7 +50,7 @@ public class LocalGridState implements GridState { protected final LocalDatamodelRunner runner; - protected final PairPLL grid; + protected final PairPLL grid; protected final ColumnModel columnModel; protected final Map overlayModels; @@ -60,6 +61,26 @@ public class LocalGridState implements GridState { * Constructs a grid state, supplying all required fields. * * @param runner + * @param columnModel + * @param grid + * @param overlayModels + */ + public LocalGridState( + LocalDatamodelRunner runner, + ColumnModel columnModel, + PairPLL grid, + Map overlayModels) { + this.runner = runner; + this.grid = grid; + this.columnModel = columnModel; + this.overlayModels = overlayModels; + this.records = null; + } + + /** + * Convenience constructor to construct a grid from a PLL of a slightly different type. + * + * @param runner * @param grid * @param columnModel * @param overlayModels @@ -70,7 +91,7 @@ public LocalGridState( ColumnModel columnModel, Map overlayModels) { this.runner = runner; - this.grid = grid; + this.grid = grid.mapToPair(tuple -> new Tuple2<>(tuple.getKey(), new IndexedRow(tuple.getKey(), tuple.getValue()))); this.columnModel = columnModel; this.overlayModels = overlayModels; this.records = null; @@ -84,7 +105,7 @@ protected PairPLL records() { } protected PLL indexedRows() { - return grid.map(tuple -> new IndexedRow(tuple.getKey(), tuple.getValue())); + return grid.values(); } @Override @@ -99,7 +120,7 @@ public ColumnModel getColumnModel() { @Override public GridState withColumnModel(ColumnModel newColumnModel) { - return new LocalGridState(runner, grid, newColumnModel, overlayModels); + return new LocalGridState(runner, newColumnModel, grid, overlayModels); } @Override @@ -108,76 +129,75 @@ public Row getRow(long id) { // it must actually scan at least one entire partition, // 
which is unnecessary since we know it is sorted and there // is at most one result. So we use `getRange` instead. - List> rows = grid.getRange(id, 1, Comparator.naturalOrder()); + List> rows = grid.getRangeAfter(id, 1, Comparator.naturalOrder()); if (rows.size() == 0) { throw new IndexOutOfBoundsException(String.format("Row id %d not found", id)); } else if (rows.size() > 1) { throw new IllegalStateException(String.format("Found %d rows at index %d", rows.size(), id)); } else { - return rows.get(0).getValue(); + return rows.get(0).getValue().getRow(); } } @Override - public List getRows(long start, int limit) { - return grid.getRange(start, limit, Comparator.naturalOrder()) + public List getRowsAfter(long start, int limit) { + return grid.getRangeAfter(start, limit, Comparator.naturalOrder()) .stream() - .map(tuple -> new IndexedRow(tuple.getKey(), tuple.getValue())) + .map(Tuple2::getValue) .collect(Collectors.toList()); } @Override - public List getRows(RowFilter filter, SortingConfig sortingConfig, long start, int limit) { - PairPLL filteredRows = grid.filter(tuple -> filter.filterRow(tuple.getKey(), tuple.getValue())); - if (SortingConfig.NO_SORTING.equals(sortingConfig)) { - return filteredRows - .getRange(start, limit, Comparator.naturalOrder()) - .stream() - .map(tuple -> new IndexedRow(tuple.getKey(), tuple.getValue())) - .collect(Collectors.toList()); - } else { - RowSorter rowSorter = new RowSorter(this, sortingConfig); - return filteredRows - .map(tuple -> new IndexedRow(tuple.getKey(), tuple.getValue())) - .sort(rowSorter) - .zipWithIndex() // this is cheap because sort loads everything in memory anyway - .getRange(start, limit, Comparator.naturalOrder()) - .stream() - .map(tuple -> tuple.getValue()) - .collect(Collectors.toList()); - } + public List getRowsAfter(RowFilter filter, long start, int limit) { + PairPLL filteredRows = grid + .filter(tuple -> filter.filterRow(tuple.getValue().getLogicalIndex(), tuple.getValue().getRow())); + return 
filteredRows + .getRangeAfter(start, limit, Comparator.naturalOrder()) + .stream() + .map(Tuple2::getValue) + .collect(Collectors.toList()); + } + + @Override + public List getRowsBefore(long end, int limit) { + return grid.getRangeBefore(end, limit, Comparator.naturalOrder()) + .stream() + .map(Tuple2::getValue) + .collect(Collectors.toList()); + } + + @Override + public List getRowsBefore(RowFilter filter, long end, int limit) { + return grid + .filter(tuple -> filter.filterRow(tuple.getValue().getLogicalIndex(), tuple.getValue().getRow())) + .getRangeBefore(end, limit, Comparator.naturalOrder()) + .stream() + .map(Tuple2::getValue) + .collect(Collectors.toList()); } @Override public List getRows(List rowIndices) { Map results = grid.getByKeys(rowIndices.stream().collect(Collectors.toSet())) .stream() - .collect(Collectors.toMap(t -> t.getKey(), t -> new IndexedRow(t.getKey(), t.getValue()))); + .collect(Collectors.toMap(t -> t.getKey(), t -> t.getValue())); return rowIndices.stream() .map(i -> results.get(i)) .collect(Collectors.toList()); } @Override - public Iterable iterateRows(RowFilter filter, SortingConfig sortingConfig) { + public Iterable iterateRows(RowFilter filter) { return new Iterable() { @Override public Iterator iterator() { PLL filteredRows = grid - .filter(tuple -> filter.filterRow(tuple.getKey(), tuple.getValue())) - .map(tuple -> new IndexedRow(tuple.getKey(), tuple.getValue())); - if (SortingConfig.NO_SORTING.equals(sortingConfig)) { - return filteredRows - .stream() - .iterator(); - } else { - RowSorter rowSorter = new RowSorter(LocalGridState.this, sortingConfig); - return filteredRows - .sort(rowSorter) - .stream() - .iterator(); - } + .filter(tuple -> filter.filterRow(tuple.getValue().getLogicalIndex(), tuple.getValue().getRow())) + .values(); + return filteredRows + .stream() + .iterator(); } }; @@ -185,7 +205,7 @@ public Iterator iterator() { @Override public long countMatchingRows(RowFilter filter) { - return grid.filter(tuple -> 
filter.filterRow(tuple.getKey(), tuple.getValue())).count(); + return grid.filter(tuple -> filter.filterRow(tuple.getValue().getLogicalIndex(), tuple.getValue().getRow())).count(); } @Override @@ -197,7 +217,7 @@ public ApproxCount countMatchingRowsApprox(RowFilter filter, long limit) { .aggregate(initialState, (ac, tuple) -> new ApproxCount( ac.getProcessed() + 1, - ac.getMatched() + (filter.filterRow(tuple.getKey(), tuple.getValue()) ? 1 : 0), + ac.getMatched() + (filter.filterRow(tuple.getValue().getLogicalIndex(), tuple.getValue().getRow()) ? 1 : 0), ac.limitReached() || ac.getProcessed() + 1 == partitionLimit), (ac1, ac2) -> new ApproxCount( ac1.getProcessed() + ac2.getProcessed(), @@ -207,7 +227,7 @@ public ApproxCount countMatchingRowsApprox(RowFilter filter, long limit) { @Override public List collectRows() { - return grid.map(tuple -> new IndexedRow(tuple.getKey(), tuple.getValue())).collect(); + return grid.values().collect(); } @Override @@ -223,45 +243,51 @@ public Record getRecord(long id) { } @Override - public List getRecords(long start, int limit) { + public List getRecordsAfter(long start, int limit) { return records() - .getRange(start, limit, Comparator.naturalOrder()) + .getRangeAfter(start, limit, Comparator.naturalOrder()) .stream() .map(tuple -> tuple.getValue()) .collect(Collectors.toList()); } @Override - public List getRecords(RecordFilter filter, SortingConfig sortingConfig, long start, int limit) { + public List getRecordsAfter(RecordFilter filter, long start, int limit) { PairPLL records = records(); - if (!SortingConfig.NO_SORTING.equals(sortingConfig)) { - RecordSorter recordSorter = new RecordSorter(this, sortingConfig); - records = records() - .values() - .sort(recordSorter) - .zipWithIndex(); - // this zipWithIndex is efficient because sorting fetches everything in - // memory, and it is trivial to zipWithIndex an InMemoryPLL. 
- } return records .filter(tuple -> filter.filterRecord(tuple.getValue())) - .getRange(start, limit, Comparator.naturalOrder()) + .getRangeAfter(start, limit, Comparator.naturalOrder()) .stream() .map(tuple -> tuple.getValue()) .collect(Collectors.toList()); } @Override - public Iterable iterateRecords(RecordFilter filter, SortingConfig sortingConfig) { + public List getRecordsBefore(long end, int limit) { + return records() + .getRangeBefore(end, limit, Comparator.naturalOrder()) + .stream() + .map(Tuple2::getValue) + .collect(Collectors.toList()); + } + + @Override + public List getRecordsBefore(RecordFilter filter, long end, int limit) { + return records() + .filter(tuple -> filter.filterRecord(tuple.getValue())) + .getRangeBefore(end, limit, Comparator.naturalOrder()) + .stream() + .map(Tuple2::getValue) + .collect(Collectors.toList()); + } + + @Override + public Iterable iterateRecords(RecordFilter filter) { return new Iterable() { @Override public Iterator iterator() { PLL records = records().values(); - if (!SortingConfig.NO_SORTING.equals(sortingConfig)) { - RecordSorter recordSorter = new RecordSorter(LocalGridState.this, sortingConfig); - records = records.sort(recordSorter); - } return records .filter(tuple -> filter.filterRecord(tuple)) .stream() @@ -333,6 +359,7 @@ protected void saveToFile(File file, Optional progressReporter File gridFile = new File(file, GRID_PATH); grid + .values() .map(LocalGridState::serializeIndexedRow) .saveAsTextFile(gridFile.getAbsolutePath(), progressReporter); @@ -348,9 +375,9 @@ protected Metadata getMetadata() { return metadata; } - protected static String serializeIndexedRow(Tuple2 tuple) { + protected static String serializeIndexedRow(IndexedRow indexedRow) { try { - return ParsingUtilities.saveWriter.writeValueAsString(new IndexedRow(tuple.getKey(), tuple.getValue())); + return ParsingUtilities.saveWriter.writeValueAsString(indexedRow); } catch (JsonProcessingException e) { throw new UncheckedIOException(e); } @@ 
-358,9 +385,11 @@ protected static String serializeIndexedRow(Tuple2 tuple) { @Override public T aggregateRows(RowAggregator aggregator, T initialState) { - return grid.aggregate(initialState, - (s, t) -> aggregator.withRow(s, t.getKey(), t.getValue()), - (s1, s2) -> aggregator.sum(s1, s2)); + return grid + .values() + .aggregate(initialState, + (s, t) -> aggregator.withRow(s, t.getLogicalIndex(), t.getRow()), + (s1, s2) -> aggregator.sum(s1, s2)); } @Override @@ -376,9 +405,10 @@ public PartialAggregation aggregateRowsApprox(RowAgg PartialAggregation initialPartialState = new PartialAggregation(initialState, 0, partitionLimit == 0); return grid .limitPartitions(partitionLimit) + .values() .aggregate(initialPartialState, (s, t) -> new PartialAggregation( - aggregator.withRow(s.getState(), t.getKey(), t.getValue()), + aggregator.withRow(s.getState(), t.getLogicalIndex(), t.getRow()), s.getProcessed() + 1, s.limitReached() || s.getProcessed() + 1 == partitionLimit), (s1, s2) -> new PartialAggregation( @@ -407,18 +437,20 @@ public PartialAggregation aggregateRecordsApprox(Rec @Override public GridState withOverlayModels(Map newOverlayModels) { - return new LocalGridState(runner, grid, columnModel, newOverlayModels); + return new LocalGridState(runner, columnModel, grid, newOverlayModels); } @Override public GridState mapRows(RowMapper mapper, ColumnModel newColumnModel) { - PairPLL newGrid = grid.mapValues((i, r) -> mapper.call(i, r)); + PairPLL newGrid = grid.mapValues((i, r) -> mapper.call(r.getLogicalIndex(), r.getRow())); return new LocalGridState(runner, newGrid, newColumnModel, overlayModels); } @Override public GridState flatMapRows(RowFlatMapper mapper, ColumnModel newColumnModel) { - PairPLL newGrid = grid.flatMap(tuple -> mapper.call(tuple.getKey(), tuple.getValue()).stream()).zipWithIndex(); + PairPLL newGrid = grid.values() + .flatMap(tuple -> mapper.call(tuple.getLogicalIndex(), tuple.getRow()).stream()) + .zipWithIndex(); return new 
LocalGridState(runner, newGrid, newColumnModel, overlayModels); } @@ -426,9 +458,9 @@ public GridState flatMapRows(RowFlatMapper mapper, ColumnModel newColumnModel) { public GridState mapRows(RowScanMapper mapper, ColumnModel newColumnModel) { PLL> newGrid = grid.scanMap( mapper.unit(), - tuple -> mapper.feed(tuple.getKey(), tuple.getValue()), + tuple -> mapper.feed(tuple.getValue().getLogicalIndex(), tuple.getValue().getRow()), (s1, s2) -> mapper.combine(s1, s2), - (s, tuple) -> Tuple2.of(tuple.getKey(), mapper.map(s, tuple.getKey(), tuple.getValue()))); + (s, tuple) -> Tuple2.of(tuple.getKey(), mapper.map(s, tuple.getValue().getLogicalIndex(), tuple.getValue().getRow()))); PairPLL paired = new PairPLL<>(newGrid, grid.getPartitioner()); return new LocalGridState(runner, paired, newColumnModel, overlayModels); @@ -444,34 +476,51 @@ public GridState mapRecords(RecordMapper mapper, ColumnModel newColumnModel) { } @Override - public GridState reorderRows(SortingConfig sortingConfig) { + public GridState reorderRows(SortingConfig sortingConfig, boolean permanent) { RowSorter rowSorter = new RowSorter(this, sortingConfig); - PairPLL newRows = indexedRows() - .sort(rowSorter) - .map(IndexedRow::getRow) - .zipWithIndex(); - - return new LocalGridState(runner, newRows, columnModel, overlayModels); + if (permanent) { + PairPLL newRows = indexedRows() + .sort(rowSorter) + .map(IndexedRow::getRow) + .zipWithIndex(); + return new LocalGridState(runner, newRows, columnModel, overlayModels); + } else { + PairPLL newRows = indexedRows() + .sort(rowSorter) + .zipWithIndex() + .mapValues((newIndex, indexedRow) -> new IndexedRow(newIndex, indexedRow.getLogicalIndex(), indexedRow.getRow())); + return new LocalGridState(runner, columnModel, newRows, overlayModels); + } } @Override - public GridState reorderRecords(SortingConfig sortingConfig) { + public GridState reorderRecords(SortingConfig sortingConfig, boolean permanent) { RecordSorter recordSorter = new RecordSorter(this, 
sortingConfig); - PairPLL newRows = records() + PLL sorted = records() .values() - .sort(recordSorter) - .flatMap(record -> record.getRows().stream()) - .zipWithIndex(); - - return new LocalGridState(runner, newRows, columnModel, overlayModels); + .sort(recordSorter); + if (permanent) { + PairPLL newRows = sorted + .flatMap(record -> record.getRows().stream()) + .zipWithIndex(); + return new LocalGridState(runner, newRows, columnModel, overlayModels); + } else { + PLL pll = sorted + .flatMap(record -> IteratorUtils.toList(record.getIndexedRows().iterator()).stream()); + PairPLL newRows = pll + .zipWithIndex() + .mapValues((newIndex, indexedRow) -> new IndexedRow(newIndex, indexedRow.getLogicalIndex(), indexedRow.getRow())); + return new LocalGridState(runner, columnModel, newRows, overlayModels); + } } @Override public GridState removeRows(RowFilter filter) { PairPLL newGrid = grid - .filter(tuple -> !filter.filterRow(tuple.getKey(), tuple.getValue())) .values() - .zipWithIndex(); + .filter(tuple -> !filter.filterRow(tuple.getLogicalIndex(), tuple.getRow())) + .zipWithIndex() + .mapValues((index, indexedRow) -> indexedRow.getRow()); return new LocalGridState(runner, newGrid, columnModel, overlayModels); } @@ -487,13 +536,15 @@ public GridState removeRecords(RecordFilter filter) { @Override public ChangeData mapRows(RowFilter filter, RowChangeDataProducer rowMapper) { - PairPLL filteredGrid = grid.filter(tuple -> filter.filterRow(tuple.getKey(), tuple.getValue())); + PairPLL filteredGrid = grid + .filter(tuple -> filter.filterRow(tuple.getValue().getLogicalIndex(), tuple.getValue().getRow())); PairPLL data; if (rowMapper.getBatchSize() == 1) { data = filteredGrid.mapValues( - (id, row) -> rowMapper.call(id, row)); + (id, row) -> rowMapper.call(row.getLogicalIndex(), row.getRow())); } else { data = filteredGrid + .values() .batchPartitions(rowMapper.getBatchSize()) .flatMap(rowBatch -> applyRowChangeDataMapper(rowMapper, rowBatch)) .mapToPair(indexedData -> 
indexedData) @@ -506,17 +557,14 @@ public ChangeData mapRows(RowFilter filter, RowChangeDataProducer rowM } protected static Stream> applyRowChangeDataMapper(RowChangeDataProducer rowMapper, - List> rowBatch) { - List changeData = rowMapper.callRowBatch( - rowBatch.stream() - .map(tuple -> new IndexedRow(tuple.getKey(), tuple.getValue())) - .collect(Collectors.toList())); + List rowBatch) { + List changeData = rowMapper.callRowBatch(rowBatch); if (changeData.size() != rowBatch.size()) { throw new IllegalStateException( String.format("Change data producer returned %d results on a batch of %d rows", changeData.size(), rowBatch.size())); } return IntStream.range(0, rowBatch.size()) - .mapToObj(i -> new Tuple2(rowBatch.get(i).getKey(), changeData.get(i))); + .mapToObj(i -> new Tuple2(rowBatch.get(i).getIndex(), changeData.get(i))); } @Override @@ -565,14 +613,16 @@ public GridState dropRows(long rowsToDrop) { // because they would be computed anyway to re-index // the rows after the drop. grid.getPartitionSizes(); - PairPLL dropped = grid.dropFirstElements(rowsToDrop); + PairPLL dropped = grid.dropFirstElements(rowsToDrop); Optional> partitioner = dropped.getPartitioner(); if (partitioner.isPresent() && partitioner.get() instanceof LongRangePartitioner) { partitioner = partitioner.map(p -> ((LongRangePartitioner) p).shiftKeys(-rowsToDrop)); } - PairPLL shifted = dropped.mapToPair(tuple -> Tuple2.of(tuple.getKey() - rowsToDrop, tuple.getValue())) + PairPLL shifted = dropped + .mapToPair(tuple -> Tuple2.of(tuple.getKey() - rowsToDrop, + new IndexedRow(tuple.getKey() - rowsToDrop, tuple.getValue().getRow()))) .withPartitioner(partitioner); - return new LocalGridState(runner, shifted, columnModel, overlayModels); + return new LocalGridState(runner, columnModel, shifted, overlayModels); } protected static Stream> applyRecordChangeDataMapper( @@ -598,7 +648,7 @@ public GridState join(ChangeData changeData, RowChangeDataJoiner rowJo } PairPLL joined = grid 
.outerJoinOrdered(((LocalChangeData) changeData).getPLL(), Comparator.naturalOrder()) - .mapValues((id, tuple) -> rowJoiner.call(id, tuple.getKey(), tuple.getValue())); + .mapValues((id, tuple) -> rowJoiner.call(id, tuple.getKey().getRow(), tuple.getValue())); return new LocalGridState(runner, joined, newColumnModel, overlayModels); } @@ -610,7 +660,7 @@ public GridState join(ChangeData changeData, RowChangeDataFlatJoiner r } PairPLL joined = grid .outerJoinOrdered(((LocalChangeData) changeData).getPLL(), Comparator.naturalOrder()) - .flatMap(tuple -> rowJoiner.call(tuple.getKey(), tuple.getValue().getKey(), tuple.getValue().getValue()).stream()) + .flatMap(tuple -> rowJoiner.call(tuple.getKey(), tuple.getValue().getKey().getRow(), tuple.getValue().getValue()).stream()) .zipWithIndex(); return new LocalGridState(runner, joined, newColumnModel, overlayModels); } @@ -639,7 +689,9 @@ public GridState concatenate(GridState other) { mergedOverlayModels.putAll(overlayModels); return new LocalGridState( runner, - grid.values().concatenate(otherLocal.grid.values()).zipWithIndex(), + grid.values() + .map(IndexedRow::getRow) + .concatenate(otherLocal.grid.values().map(IndexedRow::getRow)).zipWithIndex(), merged, mergedOverlayModels); } diff --git a/refine-local-runner/src/main/java/org/openrefine/model/local/OrderedJoinPLL.java b/refine-local-runner/src/main/java/org/openrefine/model/local/OrderedJoinPLL.java index ffd8466070db..d03c86eb8aeb 100644 --- a/refine-local-runner/src/main/java/org/openrefine/model/local/OrderedJoinPLL.java +++ b/refine-local-runner/src/main/java/org/openrefine/model/local/OrderedJoinPLL.java @@ -100,6 +100,13 @@ protected Stream>> compute(Partition partition) { Optional upperBound = Optional.empty(); if (partition.getIndex() > 0) { lowerBound = firstKeys.get(partition.getIndex() - 1); + if (lowerBound.isEmpty()) { + // This partition is empty on the left side. 
+ // We skip it: for an inner join, the result is clearly empty, + // and for an outer join the corresponding elements on the right-hand side + // are added to the joins of the neighbouring partitions. + return Stream.empty(); + } } if (partition.getIndex() < numPartitions() - 1) { upperBound = upperBounds.get(numPartitions() - 2 - partition.getIndex()); diff --git a/refine-local-runner/src/main/java/org/openrefine/model/local/PairPLL.java b/refine-local-runner/src/main/java/org/openrefine/model/local/PairPLL.java index 321b902ac26d..85b4cd916d9d 100644 --- a/refine-local-runner/src/main/java/org/openrefine/model/local/PairPLL.java +++ b/refine-local-runner/src/main/java/org/openrefine/model/local/PairPLL.java @@ -1,15 +1,7 @@ package org.openrefine.model.local; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; +import java.util.*; import java.util.function.BiFunction; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -128,7 +120,7 @@ public List get(K key) { /** * Returns the list of elements starting at the given key and for up to the given number of elements. This assumes - * that the PLL is sorted by keys. + * that the PLL is sorted by keys. * * @param from * the first key to return @@ -138,12 +130,67 @@ public List get(K key) { * the ordering on K that is assumed on the PLL * @return */ - public List> getRange(K from, int limit, Comparator comparator) { + public List> getRangeAfter(K from, int limit, Comparator comparator) { Stream> stream = streamFromKey(from, comparator); return stream .limit(limit).collect(Collectors.toList()); } + /** + * Returns the list of elements ending at the given key (excluded) and for up to the given number of elements. This + * assumes that the PLL is sorted by keys. 
+ * + * @param upperBound + * the least key not to return + * @param limit + * the maximum number of elements to return + * @param comparator + * the ordering on K that is assumed on the PLL + * @return + * + * TODO this could be optimized further, when the partitions are cached in memory, we can iterate from them + * in reverse + */ + public List> getRangeBefore(K upperBound, int limit, Comparator comparator) { + Stream> stream; + if (partitioner.isEmpty() || !(partitioner.get() instanceof LongRangePartitioner)) { + // we resort to simple scanning of all partitions + return gatherElementsBefore(upperBound, limit, stream(), comparator); + } else { + // we can use the partitioner to locate the partition to end at + int endPartition = partitioner.get().getPartition(upperBound); + List> result = new ArrayList<>(limit); + for (int currentPartition = endPartition; currentPartition >= 0 && result.size() != limit; currentPartition--) { + List> lastElements = gatherElementsBefore(upperBound, limit, iterate(getPartitions().get(currentPartition)), + comparator); + for (int i = lastElements.size() - 1; i >= 0 && result.size() < limit; i--) { + result.add(lastElements.get(i)); + } + } + Collections.reverse(result); + return result; + } + } + + /** + * Returns the last n elements whose key is strictly less than the supplied upper bound. + * + * @param stream + * the stream to take the elements from, which is assumed to be in increasing order + */ + protected static List> gatherElementsBefore(K upperBound, int limit, Stream> stream, + Comparator comparator) { + Deque> lastElements = new ArrayDeque<>(limit); + stream.takeWhile(tuple -> comparator.compare(upperBound, tuple.getKey()) > 0) + .forEach(tuple -> { + if (lastElements.size() == limit) { + lastElements.removeFirst(); + } + lastElements.addLast(tuple); + }); + return new ArrayList<>(lastElements); + } + /** * Returns the list of elements whose keys match one of the supplied keys. 
* @@ -155,7 +202,7 @@ public List> getByKeys(Set keys) { if (partitioner.isEmpty() || !(partitioner.get() instanceof LongRangePartitioner)) { return this.filter(t -> keys.contains(t.getKey())).collect(); } else { - // if the PLL is sorted by keys then we can only scan the partitions + // if the PLL is sorted by keys then we can only scan the partitions // where the keys would go, and stop scanning those partitions as soon // as a greater element is found LongRangePartitioner sortedPartitioner = (LongRangePartitioner) partitioner.get(); @@ -198,7 +245,7 @@ public List> getByKeys(Set keys) { * the first key to start iterating from * @param comparator * the order used to compare the keys - * @return + * @return a streams which starts on the first element whose key is greater or equal to the provided one */ public Stream> streamFromKey(K from, Comparator comparator) { Stream> stream; @@ -438,46 +485,4 @@ public PairPLL> outerJoinOrdered(PairPLL other, Compar return new PairPLL>(joined, joined.getPartitioner()); } - /* - * public void saveAsHadoopFile(String path, Class keyClass, Class valueClass, OutputFormat - * outputFormat, Class codec) throws IOException { Job job = - * Job.getInstance(context.getFileSystem().getConf()); job.setOutputKeyClass(keyClass); - * job.setOutputValueClass(valueClass); job.setOutputFormatClass(outputFormat.getClass()); Configuration jobConf = - * job.getConfiguration(); jobConf.set("mapreduce.output.fileoutputformat.outputdir", path); if (codec != null) { - * jobConf.set("mapreduce.output.fileoutputformat.compress", "true"); - * jobConf.set("mapreduce.output.fileoutputformat.compress.codec", codec.getCanonicalName()); - * jobConf.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.toString()); } - * - * // Create the committer OutputCommitter committer = outputFormat.getOutputCommitter(attemptContext); JobContext - * jobContext; Configuration jobContextConfig = jobContext.getConfiguration(); - * 
jobContextConfig.set("mapreduce.job.id", jobId.toString()); jobContextConfig.set("mapreduce.task.id", - * attemptId.getTaskID().toString()); jobContextConfig.set("mapreduce.task.attempt.id", attemptId.toString()); - * jobContextConfig.setBoolean("mapreduce.task.ismap", true); committer.setupJob(jobContext); - * - * runOnPartitions(writePartition(jobConf, outputFormat.getClass())); } - * - * protected Function writePartition(Configuration jobConf, Class class1) { - * return (partition -> { // Derive the filename used to serialize this partition NumberFormat formatter = - * NumberFormat.getInstance(Locale.US); formatter.setMinimumIntegerDigits(5); formatter.setGroupingUsed(false); - * String outputName = "part-" + formatter.format(partition.getIndex()); - * - * // Create the task attempt context int jobId = 0; TaskAttemptID attemptId = new TaskAttemptID("", jobId, - * TaskType.REDUCE, partition.getIndex(), 0); Configuration attemptConf = new Configuration(jobConf); - * attemptConf.setInt("mapreduce.task.partition", partition.getIndex()); TaskAttemptContext attemptContext = new - * TaskAttemptContextImpl(attemptConf, attemptId); - * - * try { // Instantiate the output format OutputFormat outputFormat = class1.newInstance(); if (outputFormat - * instanceof Configurable) { ((Configurable) outputFormat).setConf(attemptConf); } - * - * - * - * - * // Initialize the writer RecordWriter writer = outputFormat.getRecordWriter(attemptContext); - * iterate(partition).forEach(tuple -> { try { writer.write(tuple.getKey(), tuple.getValue()); } catch (IOException - * | InterruptedException e) { throw new UncheckedExecutionException(e); } finally { try { - * writer.close(attemptContext); } catch (IOException | InterruptedException e) { throw new - * UncheckedExecutionException(e); } } }); } catch (IOException | InterruptedException | InstantiationException | - * IllegalAccessException e) { throw new UncheckedExecutionException(e); } return 0L; }); } - */ - } diff --git 
a/refine-local-runner/src/main/java/org/openrefine/model/local/RecordPLL.java b/refine-local-runner/src/main/java/org/openrefine/model/local/RecordPLL.java index 379e9f33f24f..7e4aa0f0a18f 100644 --- a/refine-local-runner/src/main/java/org/openrefine/model/local/RecordPLL.java +++ b/refine-local-runner/src/main/java/org/openrefine/model/local/RecordPLL.java @@ -22,7 +22,7 @@ */ public class RecordPLL extends PLL> { - private final PairPLL parent; + private final PairPLL parent; private List partitions; protected final int keyColumnIndex; @@ -35,7 +35,7 @@ public class RecordPLL extends PLL> { * @param keyColumnIndex * the index of the column used as record key */ - public static PairPLL groupIntoRecords(PairPLL grid, int keyColumnIndex) { + public static PairPLL groupIntoRecords(PairPLL grid, int keyColumnIndex) { return new PairPLL(new RecordPLL(grid, keyColumnIndex), grid.getPartitioner()); } @@ -48,13 +48,15 @@ public static PairPLL groupIntoRecords(PairPLL grid, in * @param keyColumnIndex * the index of the column used as record key */ - public RecordPLL(PairPLL grid, int keyColumnIndex) { + public RecordPLL(PairPLL grid, int keyColumnIndex) { super(grid.getContext()); this.keyColumnIndex = keyColumnIndex; List parentPartitions = grid.getPartitions(); Stream lastPartitions = parentPartitions.stream().skip(1L); - List recordEnds = grid - .runOnPartitionsWithoutInterruption(partition -> extractRecordEnd(grid.iterate(partition), keyColumnIndex), lastPartitions); + PLL indexedRows = grid.values(); + List recordEnds = indexedRows + .runOnPartitionsWithoutInterruption(partition -> extractRecordEnd(indexedRows.iterate(partition), keyColumnIndex), + lastPartitions); parent = grid; partitions = new ArrayList<>(parentPartitions.size()); for (int i = 0; i != parentPartitions.size(); i++) { @@ -76,28 +78,27 @@ public RecordPLL(PairPLL grid, int keyColumnIndex) { } } - protected static RecordEnd extractRecordEnd(Stream> rows, int keyColumnIndex) { + protected static 
RecordEnd extractRecordEnd(Stream rows, int keyColumnIndex) { // We cannot use Stream.takeWhile here because we need to know if we have reached the end of the stream List end = new ArrayList<>(); - Iterator> iterator = rows.iterator(); - Tuple2 lastTuple = null; + Iterator iterator = rows.iterator(); + IndexedRow lastTuple = null; while (iterator.hasNext()) { lastTuple = iterator.next(); - if (!lastTuple.getValue().isCellBlank(keyColumnIndex)) { + if (!lastTuple.getRow().isCellBlank(keyColumnIndex)) { break; } - end.add(lastTuple.getValue()); + end.add(lastTuple.getRow()); } - return new RecordEnd(end, lastTuple == null || lastTuple.getValue().isCellBlank(keyColumnIndex)); + return new RecordEnd(end, lastTuple == null || lastTuple.getRow().isCellBlank(keyColumnIndex)); } protected static Stream> groupIntoRecords( - Stream> stream, + Stream stream, int keyCellIndex, boolean ignoreFirstRows, List additionalRows) { - Iterator> iterator = stream.iterator(); - Iterator indexedRows = Iterators.transform(iterator, tuple -> new IndexedRow(tuple.getKey(), tuple.getValue())); + Iterator indexedRows = stream.iterator(); Iterator recordIterator = Record.groupIntoRecords(indexedRows, keyCellIndex, ignoreFirstRows, additionalRows); Iterator> indexedRecords = Iterators.transform(recordIterator, record -> Tuple2.of(record.getStartRowId(), record)); @@ -107,7 +108,8 @@ record -> Tuple2.of(record.getStartRowId(), record)); @Override protected Stream> compute(Partition partition) { RecordPartition recordPartition = (RecordPartition) partition; - Stream> rows = parent.iterate(recordPartition.getParent()); + Stream rows = parent.iterate(recordPartition.getParent()) + .map(Tuple2::getValue); Stream> records = groupIntoRecords(rows, keyColumnIndex, partition.getIndex() != 0, recordPartition.additionalRows); return records; diff --git a/refine-local-runner/src/test/java/org/openrefine/model/local/SortedJoinPLLTests.java 
b/refine-local-runner/src/test/java/org/openrefine/model/local/OrderedJoinPLLTests.java similarity index 90% rename from refine-local-runner/src/test/java/org/openrefine/model/local/SortedJoinPLLTests.java rename to refine-local-runner/src/test/java/org/openrefine/model/local/OrderedJoinPLLTests.java index c57ce95b7302..b68b2cdccfe6 100644 --- a/refine-local-runner/src/test/java/org/openrefine/model/local/SortedJoinPLLTests.java +++ b/refine-local-runner/src/test/java/org/openrefine/model/local/OrderedJoinPLLTests.java @@ -11,7 +11,7 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; -public class SortedJoinPLLTests extends PLLTestsBase { +public class OrderedJoinPLLTests extends PLLTestsBase { private List> first; private List> second; @@ -105,6 +105,26 @@ public void testOuterJoinWithoutPartitioner() { Tuple2.of(8L, Tuple2.of("you", null)))); } + @Test + public void testOuterJoinWithEmptyPartitions() { + PairPLL firstPLL = parallelize(10, first) + .mapToPair(t -> t); + PairPLL secondPLL = parallelize(10, second) + .mapToPair(t -> t); + + PairPLL> joined = firstPLL.outerJoinOrdered(secondPLL, Comparator.naturalOrder()); + + Assert.assertEquals(joined.collect(), + Arrays.asList( + Tuple2.of(1L, Tuple2.of(null, "one")), + Tuple2.of(2L, Tuple2.of("foo", null)), + Tuple2.of(4L, Tuple2.of("bar", "four")), + Tuple2.of(5L, Tuple2.of("boom", null)), + Tuple2.of(6L, Tuple2.of("hey", "six")), + Tuple2.of(7L, Tuple2.of(null, "seven")), + Tuple2.of(8L, Tuple2.of("you", null)))); + } + @Test public void testMergeInnerEmptyStreams() { assertStreamsEqual( diff --git a/refine-local-runner/src/test/java/org/openrefine/model/local/PairPLLTests.java b/refine-local-runner/src/test/java/org/openrefine/model/local/PairPLLTests.java index 2bb02960e236..9e46667f8ad4 100644 --- a/refine-local-runner/src/test/java/org/openrefine/model/local/PairPLLTests.java +++ b/refine-local-runner/src/test/java/org/openrefine/model/local/PairPLLTests.java @@ -77,44 +77,84 
@@ public void testGet() { } @Test - public void testGetRangeWithPartitioner() { + public void testGetRangeAfterWithPartitioner() { PairPLL zipped = parallelize(4, Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17)).zipWithIndex(); - Assert.assertEquals(zipped.getRange(-1L, 2, Comparator.naturalOrder()), + Assert.assertEquals(zipped.getRangeAfter(-1L, 2, Comparator.naturalOrder()), Arrays.asList( Tuple2.of(0L, 10), Tuple2.of(1L, 11))); - Assert.assertEquals(zipped.getRange(7L, 2, Comparator.naturalOrder()), + Assert.assertEquals(zipped.getRangeAfter(7L, 2, Comparator.naturalOrder()), Arrays.asList( Tuple2.of(7L, 17))); - Assert.assertEquals(zipped.getRange(3L, 2, Comparator.naturalOrder()), + Assert.assertEquals(zipped.getRangeAfter(3L, 2, Comparator.naturalOrder()), Arrays.asList( Tuple2.of(3L, 13), Tuple2.of(4L, 14))); } @Test - public void testGetRangeWithoutPartitioner() { + public void testGetRangeAfterWithoutPartitioner() { PairPLL zipped = new PairPLL( parallelize(4, Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17)).zipWithIndex().toPLL(), Optional.empty()); - Assert.assertEquals(zipped.getRange(-1L, 2, Comparator.naturalOrder()), + Assert.assertEquals(zipped.getRangeAfter(-1L, 2, Comparator.naturalOrder()), Arrays.asList( Tuple2.of(0L, 10), Tuple2.of(1L, 11))); - Assert.assertEquals(zipped.getRange(7L, 2, Comparator.naturalOrder()), + Assert.assertEquals(zipped.getRangeAfter(7L, 2, Comparator.naturalOrder()), Arrays.asList( Tuple2.of(7L, 17))); - Assert.assertEquals(zipped.getRange(3L, 2, Comparator.naturalOrder()), + Assert.assertEquals(zipped.getRangeAfter(3L, 2, Comparator.naturalOrder()), Arrays.asList( Tuple2.of(3L, 13), Tuple2.of(4L, 14))); } + @Test + public void testGetRangeBeforeWithPartitioner() { + PairPLL zipped = parallelize(4, Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17)).zipWithIndex(); + + Assert.assertEquals(zipped.getRangeBefore(3L, 2, Comparator.naturalOrder()), + Arrays.asList( + Tuple2.of(1L, 11), + Tuple2.of(2L, 12))); + + 
Assert.assertEquals(zipped.getRangeBefore(1L, 2, Comparator.naturalOrder()), + Arrays.asList( + Tuple2.of(0L, 10))); + + Assert.assertEquals(zipped.getRangeBefore(9L, 2, Comparator.naturalOrder()), + Arrays.asList( + Tuple2.of(6L, 16), + Tuple2.of(7L, 17))); + } + + @Test + public void testGetRangeBeforeWithoutPartitioner() { + PairPLL zipped = new PairPLL<>( + parallelize(4, Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17)).zipWithIndex().toPLL(), + Optional.empty()); + + Assert.assertEquals(zipped.getRangeBefore(3L, 2, Comparator.naturalOrder()), + Arrays.asList( + Tuple2.of(1L, 11), + Tuple2.of(2L, 12))); + + Assert.assertEquals(zipped.getRangeBefore(1L, 2, Comparator.naturalOrder()), + Arrays.asList( + Tuple2.of(0L, 10))); + + Assert.assertEquals(zipped.getRangeBefore(9L, 2, Comparator.naturalOrder()), + Arrays.asList( + Tuple2.of(6L, 16), + Tuple2.of(7L, 17))); + } + @Test public void testGetByKeysWithPartitioner() { PairPLL zipped = parallelize(4, Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19)).zipWithIndex(); @@ -279,4 +319,23 @@ public void testDropRight() { Assert.assertTrue(dropped.partitioner.isPresent()); PartitionerTestUtils.checkPartitionerAdequacy(dropped.partitioner.get(), dropped); } + + @Test + public void testGatherElementsBefore() { + List> list = Arrays.asList( + Tuple2.of(0L, 10), + Tuple2.of(2L, 12), + Tuple2.of(4L, 14), + Tuple2.of(6L, 16), + Tuple2.of(8L, 18)); + + Assert.assertEquals(PairPLL.gatherElementsBefore(5L, 2, list.stream(), Comparator.naturalOrder()), + list.subList(1, 3)); + Assert.assertEquals(PairPLL.gatherElementsBefore(13L, 2, list.stream(), Comparator.naturalOrder()), + list.subList(3, 5)); + Assert.assertEquals(PairPLL.gatherElementsBefore(1L, 2, list.stream(), Comparator.naturalOrder()), + list.subList(0, 1)); + Assert.assertEquals(PairPLL.gatherElementsBefore(10L, 20, list.stream(), Comparator.naturalOrder()), + list); + } } diff --git a/refine-model/src/main/java/org/openrefine/LookupCacheManager.java 
b/refine-model/src/main/java/org/openrefine/LookupCacheManager.java index c62a31768b52..e8a2364b2d99 100644 --- a/refine-model/src/main/java/org/openrefine/LookupCacheManager.java +++ b/refine-model/src/main/java/org/openrefine/LookupCacheManager.java @@ -15,7 +15,6 @@ import org.openrefine.model.Project; import org.openrefine.model.Row; import org.openrefine.model.RowFilter; -import org.openrefine.sorting.SortingConfig; import org.openrefine.util.LookupException; /** @@ -97,7 +96,7 @@ static public class ProjectLookup implements Serializable { } // We can't use for-each here, because we'll need the row index when creating WrappedRow - for (IndexedRow indexedRow : grid.iterateRows(RowFilter.ANY_ROW, SortingConfig.NO_SORTING)) { + for (IndexedRow indexedRow : grid.iterateRows(RowFilter.ANY_ROW)) { Row targetRow = indexedRow.getRow(); Object value = targetRow.getCellValue(targetColumnIndex); if (ExpressionUtils.isNonBlankData(value)) { diff --git a/refine-model/src/main/java/org/openrefine/browsing/Engine.java b/refine-model/src/main/java/org/openrefine/browsing/Engine.java index 173ad9fc716a..f8c8b1df18ce 100644 --- a/refine-model/src/main/java/org/openrefine/browsing/Engine.java +++ b/refine-model/src/main/java/org/openrefine/browsing/Engine.java @@ -165,10 +165,19 @@ public boolean limitReached() { */ @JsonIgnore public Iterable getMatchingRows(SortingConfig sortingConfig) { + GridState sorted = _state; + if (!sortingConfig.getCriteria().isEmpty()) { + // TODO refactor this so that we are not re-sorting the grid at every request, but cache it instead? 
+ if (Mode.RowBased.equals(getMode())) { + sorted = _state.reorderRows(sortingConfig, false); + } else { + sorted = _state.reorderRecords(sortingConfig, false); + } + } if (Mode.RowBased.equals(getMode())) { - return _state.iterateRows(combinedRowFilters(), sortingConfig); + return sorted.iterateRows(combinedRowFilters()); } else { - Iterable records = _state.iterateRecords(combinedRecordFilters(), sortingConfig); + Iterable records = sorted.iterateRecords(combinedRecordFilters()); return new Iterable() { @Override @@ -210,7 +219,8 @@ public Iterable getMatchingRecords(SortingConfig sortingConfig) { if (Mode.RowBased.equals(getMode())) { throw new IllegalStateException("Cannot iterate over records in rows mode"); } - return _state.iterateRecords(combinedRecordFilters(), sortingConfig); + // TODO refactor this so that we are not resorting the grid at each request, but cache it instead? + return _state.reorderRecords(sortingConfig, false).iterateRecords(combinedRecordFilters()); } /** diff --git a/refine-model/src/main/java/org/openrefine/model/GridState.java b/refine-model/src/main/java/org/openrefine/model/GridState.java index 1aaec5d194f7..0b460a7e45a1 100644 --- a/refine-model/src/main/java/org/openrefine/model/GridState.java +++ b/refine-model/src/main/java/org/openrefine/model/GridState.java @@ -56,7 +56,8 @@ public interface GridState { * fetching them by batch, depending on the implementation. * * @param id - * the row index + * the row index. This refers to the current position of the row in the grid, which corresponds to + * {@link IndexedRow#getIndex()}. 
* @return the row at the given index * @throws IndexOutOfBoundsException * if row id could not be found @@ -72,13 +73,56 @@ public interface GridState { * the maximum number of rows to fetch * @return the list of rows with their ids (if any) */ - public List getRows(long start, int limit); + public List getRowsAfter(long start, int limit); + + /** + * Among the subset of filtered rows, return a list of rows, starting from a given index and defined by a maximum + * size. + * + * @param filter + * the subset of rows to paginate through. This object and its dependencies are required to be + * serializable. + * @param start + * the first row id to fetch (inclusive) + * @param limit + * the maximum number of rows to fetch + * @return the list of rows with their ids (if any) + * @see #getRowsBefore(long, int) + */ + public List getRowsAfter(RowFilter filter, long start, int limit); + + /** + * Returns a list of consecutive rows, just before the given row index (not included) and up to a maximum size. + * + * @param end + * the last row id to fetch (exclusive) + * @param limit + * the maximum number of rows to fetch + * @return the list of rows with their ids (if any) + * @see #getRowsAfter(long, int) + */ + public List getRowsBefore(long end, int limit); + + /** + * Among the subset of filtered rows, return a list of rows, just before the row with a given index (excluded) and + * defined by a maximum size. + * + * @param filter + * the subset of rows to paginate through. This object and its dependencies are required to be + * serializable. + * @param end + * the last row id to fetch (exclusive) + * @param limit + * the maximum number of rows to fetch + * @return the list of rows with their ids (if any) + */ + public List getRowsBefore(RowFilter filter, long end, int limit); /** * Returns a list of rows corresponding to the row indices supplied. 
By default, this calls * {@link GridState#getRow(long)} on all values, but implementations can override this to more efficient strategies * if available. - * + * * @param rowIndices * the indices of the rows to lookup * @return the list contains null values for the row indices which could not be found. @@ -95,23 +139,6 @@ public default List getRows(List rowIndices) { return result; } - /** - * Among the subset of filtered rows, return a list of rows, starting from a given index and defined by a maximum - * size. - * - * @param filter - * the subset of rows to paginate through. This object and its dependencies are required to be - * serializable. - * @param sortingConfig - * the order in which to return the rows - * @param start - * the first row id to fetch (inclusive) - * @param limit - * the maximum number of rows to fetch - * @return the list of rows with their ids (if any) - */ - public List getRows(RowFilter filter, SortingConfig sortingConfig, long start, int limit); - /** * Iterate over rows matched by a filter, in the order determined by a sorting configuration. This might not require * loading all rows in memory at once, but might be less efficient than {@link #collectRows()} if all rows are to be @@ -121,7 +148,7 @@ public default List getRows(List rowIndices) { * leaks with some implementations. This should be clarified by the interface. Consider exposing a closeable * iterable instead. */ - public Iterable iterateRows(RowFilter filter, SortingConfig sortingConfig); + public Iterable iterateRows(RowFilter filter); /** * Count the number of rows which match a given filter. @@ -154,7 +181,8 @@ public default List getRows(List rowIndices) { * inefficient depending on the implementation. * * @param id - * the row id of the first row in the record + * the row id of the first row in the record. This refers to the current position of the record in the + * grid, which corresponds to {@link Record#getStartRowId()}. 
* @return the corresponding record * @throws IllegalArgumentException * if record id could not be found @@ -169,8 +197,9 @@ public default List getRows(List rowIndices) { * @param limit * the maximum number of records to fetch * @return the list of records (if any) + * @see #getRecordsBefore(long, int) */ - public List getRecords(long start, int limit); + public List getRecordsAfter(long start, int limit); /** * Among the filtered subset of records, returns a list of records, starting from a given index and defined by a @@ -179,22 +208,46 @@ public default List getRows(List rowIndices) { * @param filter * the filter which defines the subset of records to paginate through This object and its dependencies * are required to be serializable. - * @param sortingConfig - * the order in which the rows should be returned * @param start * the first record id to fetch (inclusive) * @param limit * the maximum number of records to fetch * @return the list of records (if any) */ - public List getRecords(RecordFilter filter, SortingConfig sortingConfig, long start, int limit); + public List getRecordsAfter(RecordFilter filter, long start, int limit); + + /** + * Returns a list of consecutive records, ending at a given index (exclusive) and defined by a maximum size. + * + * @param end + * the last record id to fetch (exclusive) + * @param limit + * the maximum number of records to fetch + * @return the list of records (if any) + * @see #getRecordsAfter(long, int) + */ + public List getRecordsBefore(long end, int limit); + + /** + * Among the filtered subset of records, returns a list of records, ending at a given index (exclusive) and defined + * by a maximum size. + * + * @param filter + * the filter which defines the subset of records to paginate through This object and its dependencies + * are required to be serializable. 
+ * @param end + * the last record id to fetch (exclusive) + * @param limit + * the maximum number of records to fetch + * @return the list of records (if any) + */ + public List getRecordsBefore(RecordFilter filter, long end, int limit); /** - * Iterate over records matched by a filter, ordered according to the sorting configuration. This might not require - * loading all records in memory at once, but might be less efficient than {@link #collectRecords()} if all records - * are to be stored in memory downstream. + * Iterate over records matched by a filter. This might not require loading all records in memory at once, but might + * be less efficient than {@link #collectRecords()} if all records are to be stored in memory downstream. */ - public Iterable iterateRecords(RecordFilter filter, SortingConfig sortingConfig); + public Iterable iterateRecords(RecordFilter filter); /** * Return the number of records which are filtered by this filter. @@ -347,18 +400,24 @@ public PartialAggregation aggregateRecordsApprox(Rec * * @param sortingConfig * the criteria to sort rows + * @param permanent + * if true, forget the original row ids. If false, store them in the corresponding + * {@link IndexedRow#getOriginalIndex()}. * @return the resulting grid state */ - public GridState reorderRows(SortingConfig sortingConfig); + public GridState reorderRows(SortingConfig sortingConfig, boolean permanent); /** * Returns a new grid state where records have been reordered according to the configuration supplied. * * @param sortingConfig * the criteria to sort records + * @param permanent + * if true, forget the original record ids. If false, store them in the corresponding + * {@link Record#getOriginalStartRowId()}. 
* @return the resulting grid state */ - public GridState reorderRecords(SortingConfig sortingConfig); + public GridState reorderRecords(SortingConfig sortingConfig, boolean permanent); /** * Removes all rows selected by a filter diff --git a/refine-model/src/main/java/org/openrefine/model/IndexedRow.java b/refine-model/src/main/java/org/openrefine/model/IndexedRow.java index 771ddb07ba35..199ff8d4c58e 100644 --- a/refine-model/src/main/java/org/openrefine/model/IndexedRow.java +++ b/refine-model/src/main/java/org/openrefine/model/IndexedRow.java @@ -4,11 +4,17 @@ import java.io.Serializable; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; /** * A row with its row index. Equivalent to {@code Tuple2} but serializable and deserializable with Jackson * easily. Serialization keys are kept short to reduce the memory overhead. + *

+ * This class can also hold a so-called original row id, which is used when sorting grids. The original row id is the + * original position of the row before sorting the grid. This is used when the user adds sorting settings in the UI + * without permanently re-ordering the rows yet. * * @author Antonin Delpeuch */ @@ -17,33 +23,68 @@ public class IndexedRow implements Serializable { private static final long serialVersionUID = -8959175171376953548L; private final long _id; + private final long _originalId; private final Row _row; + public IndexedRow( + long id, + Row row) { + _id = id; + _row = row; + _originalId = -1; + } + @JsonCreator public IndexedRow( @JsonProperty("i") long id, + @JsonProperty("o") Long originalId, @JsonProperty("r") Row row) { _id = id; _row = row; + _originalId = originalId == null ? -1 : originalId; } + /** + * The position (0-based) of the row in the grid. + */ @JsonProperty("i") public long getIndex() { return _id; } + /** + * The row + */ @JsonProperty("r") public Row getRow() { return _row; } + /** + * If this grid was obtained by applying a temporary sorting, this returns the original id of the row in the + * unsorted grid. Otherwise, this returns null. + */ + @JsonProperty("o") + @JsonInclude(JsonInclude.Include.NON_NULL) + public Long getOriginalIndex() { + return _originalId == -1 ? null : _originalId; + } + + /** + * The original index of this row, or if it is not set, the actual one. + */ + @JsonIgnore + public long getLogicalIndex() { + return _originalId == -1 ? 
_id : _originalId; + } + @Override public boolean equals(Object other) { if (!(other instanceof IndexedRow)) { return false; } IndexedRow otherRow = (IndexedRow) other; - return _id == otherRow._id && _row.equals(otherRow._row); + return _id == otherRow._id && _row.equals(otherRow._row) && _originalId == otherRow._originalId; } @Override @@ -53,6 +94,10 @@ public int hashCode() { @Override public String toString() { - return String.format("IndexedRow %d: %s", _id, _row); + if (_originalId == -1) { + return String.format("IndexedRow %d: %s", _id, _row); + } else { + return String.format("IndexRow (%d -> %d): %s", _originalId, _id, _row); + } } } diff --git a/refine-model/src/main/java/org/openrefine/model/Record.java b/refine-model/src/main/java/org/openrefine/model/Record.java index 6a7c1da46e37..2cce9ccc2395 100644 --- a/refine-model/src/main/java/org/openrefine/model/Record.java +++ b/refine-model/src/main/java/org/openrefine/model/Record.java @@ -34,9 +34,7 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT package org.openrefine.model; import java.io.Serializable; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; +import java.util.*; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -45,6 +43,11 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT /** * A list of consecutive rows where only the first row has a non-blank value in the record key column (normally, the * first column). + *

+ * This can also store the original row id of the first row in the record, if records have been temporarily reordered. + * Note that this assumes that the grid has been temporarily reordered as records, not as rows, so that only the first + * original row id needs to be recorded (so that the original ids of the other rows in the records can be deduced from + * it). * * @author Antonin Delpeuch */ @@ -53,23 +56,60 @@ public class Record implements Serializable { private static final long serialVersionUID = 1547689057610085206L; final private long startRowIndex; + final private Long startRowOriginalIndex; final private List rows; public Record( long startRowIndex, List rows) { this.startRowIndex = startRowIndex; + this.startRowOriginalIndex = null; this.rows = rows; } + public Record( + long startRowIndex, + Long startRowOriginalIndex, + List rows) { + this.startRowIndex = startRowIndex; + this.startRowOriginalIndex = startRowOriginalIndex; + this.rows = rows; + } + + /** + * The id of the first row in this record. + */ public long getStartRowId() { return startRowIndex; } + /** + * The id of the last row in this record. + */ public long getEndRowId() { return startRowIndex + rows.size(); } + /** + * The original id (before applying a temporary sorting operation) of the first row in this record. + *

+ * If no sorting has been applied, this returns null. + */ + public Long getOriginalStartRowId() { + return startRowOriginalIndex; + } + + /** + * The record index to be used in user-exposed features (expression language, filtering, UI…). If + * {@link #getOriginalStartRowId()} is defined then it is returned, otherwise this is {@link #getStartRowId()}. + */ + public long getLogicalStartRowId() { + return startRowOriginalIndex != null ? startRowOriginalIndex : startRowIndex; + } + + /** + * The rows contained in this record. + */ public List getRows() { return rows; } @@ -79,9 +119,15 @@ public Iterable getIndexedRows() { @Override public Iterator iterator() { - return IntStream.range(0, rows.size()) - .mapToObj(i -> new IndexedRow(startRowIndex + i, rows.get(i))) - .iterator(); + if (startRowOriginalIndex == null) { + return IntStream.range(0, rows.size()) + .mapToObj(i -> new IndexedRow(startRowIndex + i, rows.get(i))) + .iterator(); + } else { + return IntStream.range(0, rows.size()) + .mapToObj(i -> new IndexedRow(startRowIndex + i, startRowOriginalIndex + i, rows.get(i))) + .iterator(); + } } }; @@ -98,7 +144,9 @@ public boolean equals(Object other) { return false; } Record otherRecord = (Record) other; - return startRowIndex == otherRecord.getStartRowId() && rows.equals(otherRecord.getRows()); + return (startRowIndex == otherRecord.getStartRowId() && + Objects.equals(startRowOriginalIndex, otherRecord.getOriginalStartRowId()) && + rows.equals(otherRecord.getRows())); } @Override @@ -108,9 +156,16 @@ public int hashCode() { @Override public String toString() { - return String.format("[Record, id %d, rows:\n%s\n]", - startRowIndex, - String.join("\n", rows.stream().map(r -> r.toString()).collect(Collectors.toList()))); + if (startRowOriginalIndex == null) { + return String.format("[Record, id %d, rows:\n%s\n]", + startRowIndex, + rows.stream().map(Row::toString).collect(Collectors.joining("\n"))); + } else { + return String.format("[Record (%d -> %d), 
rows:\n%s\n]", + startRowOriginalIndex, + startRowIndex, + rows.stream().map(Row::toString).collect(Collectors.joining("\n"))); + } } /** @@ -125,7 +180,8 @@ public static boolean isRecordStart(Row row, int keyCellIndex) { * Groups a stream of indexed rows into a stream of records. * * @param parentIter - * the iterator of rows + * the iterator of rows. They are supposed to be in their original order, meaning that all their + * {@link IndexedRow#getOriginalIndex()} is null. * @param keyCellIndex * the index of the column used to group rows into records * @param ignoreFirstRows @@ -171,9 +227,11 @@ public Record next() { private void buildNextRecord() { List rows = new ArrayList<>(); long startRowId = 0; + Long originalStartRowId = null; if (fetchedRowTuple != null) { rows.add(fetchedRowTuple.getRow()); startRowId = fetchedRowTuple.getIndex(); + originalStartRowId = fetchedRowTuple.getOriginalIndex(); fetchedRowTuple = null; } while (parentIter.hasNext()) { @@ -189,10 +247,9 @@ private void buildNextRecord() { rows.addAll(additionalRows); additionalRowsConsumed = true; } - nextRecord = rows.isEmpty() ? null : new Record(startRowId, rows); + nextRecord = rows.isEmpty() ? 
null : new Record(startRowId, originalStartRowId, rows); } }; } - } diff --git a/refine-model/src/test/java/org/openrefine/browsing/EngineTests.java b/refine-model/src/test/java/org/openrefine/browsing/EngineTests.java index 5ee97f868f2d..044b54b0369f 100644 --- a/refine-model/src/test/java/org/openrefine/browsing/EngineTests.java +++ b/refine-model/src/test/java/org/openrefine/browsing/EngineTests.java @@ -171,7 +171,7 @@ public void testLimitReached() { public void testGetMatchingRows() { @SuppressWarnings("unchecked") Iterable mockIterable = mock(Iterable.class); - when(initialState.iterateRows(Mockito.any(), Mockito.eq(SortingConfig.NO_SORTING))).thenReturn(mockIterable); + when(initialState.iterateRows(Mockito.any())).thenReturn(mockIterable); Assert.assertEquals(engine.getMatchingRows(SortingConfig.NO_SORTING), mockIterable); } diff --git a/refine-model/src/test/java/org/openrefine/model/IndexedRowTests.java b/refine-model/src/test/java/org/openrefine/model/IndexedRowTests.java index 531e58171131..33493419aa56 100644 --- a/refine-model/src/test/java/org/openrefine/model/IndexedRowTests.java +++ b/refine-model/src/test/java/org/openrefine/model/IndexedRowTests.java @@ -31,4 +31,23 @@ public void readIndexedRow() throws JsonParseException, JsonMappingException, IO IndexedRow.class); Assert.assertEquals(tuple.getIndex(), 1234L); } + + @Test + public void saveIndexedRowWithOriginalId() { + Row row = new Row(Arrays.asList(new Cell("I'm not empty", null))); + IndexedRow tuple = new IndexedRow(1234L, 5678L, row); + TestUtils.isSerializedTo( + tuple, + "{\"i\":1234,\"o\":5678,\"r\":{\"flagged\":false,\"starred\":false,\"cells\":[{\"v\":\"I'm not empty\"}]}}", + ParsingUtilities.defaultWriter); + } + + @Test + public void readIndexedRowWithOriginalId() throws JsonParseException, JsonMappingException, IOException { + IndexedRow tuple = ParsingUtilities.mapper.readValue( + "{\"i\":1234,\"o\":5678,\"r\":{\"flagged\":false,\"starred\":false,\"cells\":[{\"v\":\"I'm not 
empty\"}]}}", + IndexedRow.class); + Assert.assertEquals(tuple.getIndex(), 1234L); + Assert.assertEquals(tuple.getOriginalIndex(), 5678L); + } } diff --git a/refine-model/src/test/java/org/openrefine/model/RecordTests.java b/refine-model/src/test/java/org/openrefine/model/RecordTests.java index 708598c064e2..b471f46a93e7 100644 --- a/refine-model/src/test/java/org/openrefine/model/RecordTests.java +++ b/refine-model/src/test/java/org/openrefine/model/RecordTests.java @@ -18,6 +18,7 @@ public class RecordTests { List rows; Record SUT; + Record sortedRecord; @BeforeMethod public void setUp() { @@ -26,6 +27,7 @@ public void setUp() { row(null, "c"), row("", "d")); SUT = new Record(34L, rows); + sortedRecord = new Record(34L, 56L, rows); } @Test @@ -34,6 +36,11 @@ public void testAccessors() { Assert.assertEquals(SUT.getStartRowId(), 34L); Assert.assertEquals(SUT.getEndRowId(), 37L); Assert.assertEquals(SUT.size(), 3); + Assert.assertEquals(SUT.getOriginalStartRowId(), null); + Assert.assertEquals(SUT.getLogicalStartRowId(), 34L); + + Assert.assertEquals(sortedRecord.getOriginalStartRowId(), 56L); + Assert.assertEquals(sortedRecord.getLogicalStartRowId(), 56L); } @Test @@ -86,6 +93,7 @@ public void testEquals() { Assert.assertNotEquals(SUT, 23); Assert.assertNotEquals(SUT, new Record(34L, Collections.emptyList())); Assert.assertEquals(SUT, new Record(34L, rows)); + Assert.assertNotEquals(SUT, sortedRecord); } @Test @@ -96,6 +104,7 @@ public void testHashCode() { @Test public void testToString() { Assert.assertTrue(SUT.toString().contains("Record")); + Assert.assertTrue(sortedRecord.toString().contains("56")); } } diff --git a/refine-spark-runner/src/main/java/org/openrefine/model/SparkGridState.java b/refine-spark-runner/src/main/java/org/openrefine/model/SparkGridState.java index 97d6d814cd73..8b4d1eefb2b8 100644 --- a/refine-spark-runner/src/main/java/org/openrefine/model/SparkGridState.java +++ 
b/refine-spark-runner/src/main/java/org/openrefine/model/SparkGridState.java @@ -68,7 +68,7 @@ public class SparkGridState implements GridState { protected final Map overlayModels; protected final ColumnModel columnModel; - protected final JavaPairRDD grid; + protected final JavaPairRDD grid; // not final because it is initialized on demand, as creating // it involves running a (small) Spark job protected JavaPairRDD records = null; @@ -94,6 +94,22 @@ public SparkGridState( this(columnModel, grid, overlayModels, runner, -1, -1); } + /** + * Creates a grid state from a grid and a column model + * + * @param grid + * the state of the table + * @param columnModel + * the header of the table + */ + public SparkGridState( + JavaPairRDD grid, + ColumnModel columnModel, + Map overlayModels, + SparkDatamodelRunner runner) { + this(grid, columnModel, overlayModels, runner, -1, -1); + } + /** * Creates a grid state from a grid and a column model * @@ -112,15 +128,45 @@ protected SparkGridState( long cachedRowCount, long cachedRecordCount) { this.columnModel = columnModel; + // Ensure that the grid has a partitioner - this.grid = SortedRDD.assumeSorted(grid); + JavaPairRDD rdd = grid + .mapToPair(tuple -> new Tuple2(tuple._1, new IndexedRow(tuple._1, tuple._2))); + this.grid = SortedRDD.assumeSorted(rdd); this.cachedRowCount = cachedRowCount; this.cachedRecordCount = cachedRecordCount; this.overlayModels = immutableMap(overlayModels); this.runner = runner; + } + + /** + * Creates a grid state from a grid and a column model + * + * @param grid + * the state of the table + * @param columnModel + * the header of the table + * @param cachedRowCount + * the number of rows in the table, cached + */ + protected SparkGridState( + JavaPairRDD grid, + ColumnModel columnModel, + Map overlayModels, + SparkDatamodelRunner runner, + long cachedRowCount, + long cachedRecordCount) { + this.columnModel = columnModel; + // Ensure that the grid has a partitioner + this.grid = 
SortedRDD.assumeSorted(grid); + + this.cachedRowCount = cachedRowCount; + this.cachedRecordCount = cachedRecordCount; + this.overlayModels = immutableMap(overlayModels); + this.runner = runner; } private ImmutableMap immutableMap(Map map) { @@ -149,7 +195,7 @@ public DatamodelRunner getDatamodelRunner() { * @return the grid data at this stage of the workflow */ @JsonIgnore - public JavaPairRDD getGrid() { + public JavaPairRDD getGrid() { return grid; } @@ -175,26 +221,26 @@ public boolean isRecordCountCached() { */ @JsonIgnore public JavaRDD getIndexedRows() { - return grid.map(t -> new IndexedRow(t._1, t._2)); + return grid.values(); } @Override public Row getRow(long id) { - List rows = grid.lookup(id); + List rows = grid.lookup(id); if (rows.size() == 0) { throw new IndexOutOfBoundsException(String.format("Row id %d not found", id)); } else if (rows.size() > 1) { throw new IllegalStateException(String.format("Found %d rows at index %d", rows.size(), id)); } else { - return rows.get(0); + return rows.get(0).getRow(); } } @Override - public List getRows(long start, int limit) { - return RDDUtils.paginate(grid, start, limit) + public List getRowsAfter(long start, int limit) { + return RDDUtils.paginateAfter(grid, start, limit) .stream() - .map(tuple -> new IndexedRow(tuple._1, tuple._2)) + .map(Tuple2::_2) .collect(Collectors.toList()); } @@ -211,61 +257,54 @@ public List getRows(List rowIndices) { } @Override - public List getRows(RowFilter filter, SortingConfig sortingConfig, long start, int limit) { - JavaPairRDD filteredGrid = grid; + public List getRowsAfter(RowFilter filter, long start, int limit) { + JavaPairRDD filteredGrid = grid; if (!filter.equals(RowFilter.ANY_ROW)) { filteredGrid = grid.filter(wrapRowFilter(filter)); } - if (sortingConfig.equals(SortingConfig.NO_SORTING)) { - // Without sorting, we can rely on row ids to paginate - return RDDUtils.paginate(filteredGrid, start, limit) - .stream() - .map(tuple -> new IndexedRow(tuple._1, tuple._2)) - 
.collect(Collectors.toList()); - } else { - RowSorter sorter = new RowSorter(this, sortingConfig); - // If we have a sorter, pagination is less efficient since we cannot rely - // on the partitioner to locate the rows in the appropriate partition - List rows = filteredGrid - .map(t -> new IndexedRow(t._1, t._2)) - .keyBy(ir -> ir) - .sortByKey(sorter) - .values() - .take((int) start + limit); - return rows - .subList(Math.min((int) start, rows.size()), Math.min((int) start + limit, rows.size())); + // Without sorting, we can rely on row ids to paginate + return RDDUtils.paginateAfter(filteredGrid, start, limit) + .stream() + .map(Tuple2::_2) + .collect(Collectors.toList()); + } + + @Override + public List getRowsBefore(long end, int limit) { + return getRowsBefore(RowFilter.ANY_ROW, end, limit); + } + + @Override + public List getRowsBefore(RowFilter filter, long end, int limit) { + JavaPairRDD filteredGrid = grid; + if (!filter.equals(RowFilter.ANY_ROW)) { + filteredGrid = grid.filter(wrapRowFilter(filter)); } + // Without sorting, we can rely on row ids to paginate + return RDDUtils.paginateBefore(filteredGrid, end, limit) + .stream() + .map(Tuple2::_2) + .collect(Collectors.toList()); } - private static Function, Boolean> wrapRowFilter(RowFilter filter) { - return new Function, Boolean>() { + private static Function, Boolean> wrapRowFilter(RowFilter filter) { + return new Function, Boolean>() { private static final long serialVersionUID = 2093008247452689031L; @Override - public Boolean call(Tuple2 tuple) throws Exception { - return filter.filterRow(tuple._1, tuple._2); + public Boolean call(Tuple2 tuple) throws Exception { + return filter.filterRow(tuple._2.getLogicalIndex(), tuple._2.getRow()); } }; } @Override - public Iterable iterateRows(RowFilter filter, SortingConfig sortingConfig) { - JavaRDD filtered; - if (SortingConfig.NO_SORTING.equals(sortingConfig)) { - filtered = grid - .filter(wrapRowFilter(filter)) - .map(t -> new IndexedRow(t._1, t._2)); - } 
else { - RowSorter sorter = new RowSorter(this, sortingConfig); - filtered = grid - .filter(wrapRowFilter(filter)) - .map(t -> new IndexedRow(t._1, t._2)) - .keyBy(ir -> ir) - .sortByKey(sorter) - .values(); - } + public Iterable iterateRows(RowFilter filter) { + JavaRDD filtered = grid + .filter(wrapRowFilter(filter)) + .values(); return new Iterable() { @@ -279,7 +318,7 @@ public Iterator iterator() { @Override public List collectRows() { - return grid.map(tuple -> new IndexedRow(tuple._1, tuple._2)).collect(); + return grid.values().collect(); } /** @@ -322,14 +361,15 @@ public ApproxCount call(ApproxCount v1, ApproxCount v2) throws Exception { }; } - private static Function2, ApproxCount> approxRowFilterAggregator(RowFilter filter, long rowLimit) { - return new Function2, ApproxCount>() { + private static Function2, ApproxCount> approxRowFilterAggregator(RowFilter filter, + long rowLimit) { + return new Function2, ApproxCount>() { private static final long serialVersionUID = -54284705503006433L; @Override - public ApproxCount call(ApproxCount count, Tuple2 tuple) throws Exception { - long matched = count.getMatched() + (filter.filterRow(tuple._1, tuple._2) ? 1 : 0); + public ApproxCount call(ApproxCount count, Tuple2 tuple) throws Exception { + long matched = count.getMatched() + (filter.filterRow(tuple._2.getLogicalIndex(), tuple._2.getRow()) ? 
1 : 0); return new ApproxCount(count.getProcessed() + 1, matched, count.limitReached() || count.getProcessed() + 1 == rowLimit); } @@ -360,32 +400,36 @@ public Record getRecord(long id) { } @Override - public List getRecords(long start, int limit) { - return RDDUtils.paginate(getRecords(), start, limit) + public List getRecordsAfter(long start, int limit) { + return RDDUtils.paginateAfter(getRecords(), start, limit) .stream() .map(tuple -> tuple._2) .collect(Collectors.toList()); } @Override - public List getRecords(RecordFilter filter, SortingConfig sortingConfig, long start, int limit) { + public List getRecordsAfter(RecordFilter filter, long start, int limit) { JavaPairRDD filteredRecords = getRecords(); if (!filter.equals(RecordFilter.ANY_RECORD)) { filteredRecords = filteredRecords.filter(wrapRecordFilter(filter)); } - if (SortingConfig.NO_SORTING.equals(sortingConfig)) { - return RDDUtils.paginate(filteredRecords, start, limit) - .stream().map(tuple -> tuple._2).collect(Collectors.toList()); - } else { - RecordSorter sorter = new RecordSorter(this, sortingConfig); - return filteredRecords - .values() - .keyBy(record -> record) - .sortByKey(sorter) - .values() - .take((int) start + limit) - .subList((int) start, (int) start + limit); + return RDDUtils.paginateAfter(filteredRecords, start, limit) + .stream().map(tuple -> tuple._2).collect(Collectors.toList()); + } + + @Override + public List getRecordsBefore(long end, int limit) { + return getRecordsBefore(RecordFilter.ANY_RECORD, end, limit); + } + + @Override + public List getRecordsBefore(RecordFilter filter, long end, int limit) { + JavaPairRDD filteredRecords = getRecords(); + if (!filter.equals(RecordFilter.ANY_RECORD)) { + filteredRecords = filteredRecords.filter(wrapRecordFilter(filter)); } + return RDDUtils.paginateBefore(filteredRecords, end, limit) + .stream().map(tuple -> tuple._2).collect(Collectors.toList()); } private static Function, Boolean> wrapRecordFilter(RecordFilter filter) { @@ -402,21 
+446,10 @@ public Boolean call(Tuple2 tuple) throws Exception { } @Override - public Iterable iterateRecords(RecordFilter filter, SortingConfig sortingConfig) { - JavaRDD filtered; - if (SortingConfig.NO_SORTING.equals(sortingConfig)) { - filtered = getRecords() - .filter(wrapRecordFilter(filter)) - .values(); - } else { - RecordSorter sorter = new RecordSorter(this, sortingConfig); - filtered = getRecords() - .filter(wrapRecordFilter(filter)) - .values() - .keyBy(record -> record) - .sortByKey(sorter) - .values(); - } + public Iterable iterateRecords(RecordFilter filter) { + JavaRDD filtered = getRecords() + .filter(wrapRecordFilter(filter)) + .values(); return new Iterable() { @Override @@ -515,8 +548,8 @@ public void saveToFile(File file, ProgressReporter progressReporter) throws IOEx progressReporter.reportProgress(100); } - protected static String serializeIndexedRow(Tuple2 indexedRow) throws JsonProcessingException { - return ParsingUtilities.saveWriter.writeValueAsString(new IndexedRow(indexedRow._1, indexedRow._2)); + protected static String serializeIndexedRow(Tuple2 indexedRow) throws JsonProcessingException { + return ParsingUtilities.saveWriter.writeValueAsString(indexedRow._2); } public static SparkGridState loadFromFile(JavaSparkContext context, File file) throws IOException { @@ -581,27 +614,27 @@ public PartialAggregation aggregateRecordsApprox(Rec .aggregate(initialPartialState, recordSeqOpPartial(aggregator, partitionLimit), facetCombineOpPartial(aggregator)); } - private static Function2, T> rowSeqOp(RowAggregator aggregator) { - return new Function2, T>() { + private static Function2, T> rowSeqOp(RowAggregator aggregator) { + return new Function2, T>() { private static final long serialVersionUID = 2188564367142265354L; @Override - public T call(T states, Tuple2 rowTuple) throws Exception { - return aggregator.withRow(states, rowTuple._1, rowTuple._2); + public T call(T states, Tuple2 rowTuple) throws Exception { + return 
aggregator.withRow(states, rowTuple._2.getLogicalIndex(), rowTuple._2.getRow()); } }; } - private static Function2, Tuple2, PartialAggregation> rowSeqOpPartial( + private static Function2, Tuple2, PartialAggregation> rowSeqOpPartial( RowAggregator aggregator, long limit) { - return new Function2, Tuple2, PartialAggregation>() { + return new Function2, Tuple2, PartialAggregation>() { private static final long serialVersionUID = 2188564367142265354L; @Override - public PartialAggregation call(PartialAggregation states, Tuple2 rowTuple) throws Exception { - T newState = aggregator.withRow(states.getState(), rowTuple._1, rowTuple._2); + public PartialAggregation call(PartialAggregation states, Tuple2 rowTuple) throws Exception { + T newState = aggregator.withRow(states.getState(), rowTuple._2.getLogicalIndex(), rowTuple._2.getRow()); return new PartialAggregation(newState, states.getProcessed() + 1, states.limitReached() || (states.getProcessed() + 1 == limit)); } @@ -669,18 +702,18 @@ public PartialAggregation call(PartialAggregation statesA, PartialAggregat @Override public SparkGridState mapRows(RowMapper mapper, ColumnModel newColumnModel) { - JavaPairRDD rows = RDDUtils.mapKeyValuesToValues(grid, rowMap(mapper)); - return new SparkGridState(newColumnModel, rows, overlayModels, runner, cachedRowCount, -1); + JavaPairRDD rows = RDDUtils.mapKeyValuesToValues(grid, rowMap(mapper)); + return new SparkGridState(rows, newColumnModel, overlayModels, runner, cachedRowCount, -1); } - private static Function2 rowMap(RowMapper mapper) { - return new Function2() { + private static Function2 rowMap(RowMapper mapper) { + return new Function2() { private static final long serialVersionUID = 429225090136968798L; @Override - public Row call(Long id, Row row) throws Exception { - return mapper.call(id, row); + public IndexedRow call(Long id, IndexedRow row) throws Exception { + return new IndexedRow(row.getIndex(), row.getOriginalIndex(), mapper.call(row.getLogicalIndex(), 
row.getRow())); } }; } @@ -691,14 +724,14 @@ public GridState flatMapRows(RowFlatMapper mapper, ColumnModel newColumnModel) { return new SparkGridState(newColumnModel, rows, overlayModels, runner); } - private static FlatMapFunction, Row> rowFlatMap(RowFlatMapper mapper) { - return new FlatMapFunction, Row>() { + private static FlatMapFunction, Row> rowFlatMap(RowFlatMapper mapper) { + return new FlatMapFunction, Row>() { private static final long serialVersionUID = -2920197696120331752L; @Override - public Iterator call(Tuple2 t) throws Exception { - return mapper.call(t._1, t._2).iterator(); + public Iterator call(Tuple2 t) throws Exception { + return mapper.call(t._2.getLogicalIndex(), t._2.getRow()).iterator(); } }; @@ -706,31 +739,31 @@ public Iterator call(Tuple2 t) throws Exception { @Override public GridState mapRows(RowScanMapper mapper, ColumnModel newColumnModel) { - RDD> rows = new ScanMapRDD, Tuple2>( + RDD> rows = new ScanMapRDD, Tuple2>( grid.rdd(), rowScanMapperToFeed(mapper), rowScanMapperToCombine(mapper), rowScanMapperToMap(mapper), mapper.unit(), - RDDUtils.ROW_TUPLE2_TAG, - RDDUtils.ROW_TUPLE2_TAG, + RDDUtils.INDEXEDROW_TUPLE2_TAG, + RDDUtils.INDEXEDROW_TUPLE2_TAG, ClassManifestFactory.fromClass(Serializable.class)); return new SparkGridState( + new JavaPairRDD(rows, RDDUtils.LONG_TAG, RDDUtils.INDEXEDROW_TAG), newColumnModel, - new JavaPairRDD(rows, RDDUtils.LONG_TAG, RDDUtils.ROW_TAG), overlayModels, runner); } - private static Function, Serializable> rowScanMapperToFeed(RowScanMapper mapper) { - return new Function, Serializable>() { + private static Function, Serializable> rowScanMapperToFeed(RowScanMapper mapper) { + return new Function, Serializable>() { private static final long serialVersionUID = -5264889389519072017L; @Override - public Serializable call(Tuple2 tuple) throws Exception { - return mapper.feed(tuple._1, tuple._2); + public Serializable call(Tuple2 tuple) throws Exception { + return mapper.feed(tuple._2.getLogicalIndex(), 
tuple._2.getRow()); } }; @@ -751,16 +784,17 @@ public Serializable call(Serializable v1, Serializable v2) throws Exception { }; } - private static Function2, Tuple2> rowScanMapperToMap( + private static Function2, Tuple2> rowScanMapperToMap( RowScanMapper mapper) { - return new Function2, Tuple2>() { + return new Function2, Tuple2>() { private static final long serialVersionUID = -321428794497355320L; @SuppressWarnings("unchecked") @Override - public Tuple2 call(Serializable v1, Tuple2 v2) throws Exception { - return new Tuple2(v2._1, mapper.map((S) v1, v2._1, v2._2)); + public Tuple2 call(Serializable v1, Tuple2 v2) throws Exception { + return new Tuple2(v2._1, + new IndexedRow(v2._1, v2._2.getOriginalIndex(), mapper.map((S) v1, v2._2.getLogicalIndex(), v2._2.getRow()))); } }; @@ -774,7 +808,7 @@ public SparkGridState mapRecords(RecordMapper mapper, ColumnModel newColumnModel JavaPairRDD> newRows = getRecords().flatMapValues(rowPreservingRecordMap(mapper)); rows = new PartitionedRDD(JavaPairRDD.fromJavaRDD(newRows.values()), newRows.partitioner().get()) - .asPairRDD(newRows.kClassTag(), grid.vClassTag()); + .asPairRDD(newRows.kClassTag(), RDDUtils.ROW_TAG); } else { // We need to recompute row ids and get a new partitioner JavaRDD newRows = getRecords().values().flatMap(recordMap(mapper)); @@ -818,11 +852,22 @@ public Iterator call(Record record) throws Exception { }; } + private static FlatMapFunction recordToIndexedRows = new FlatMapFunction() { + + private static final long serialVersionUID = 6663749328661449792L; + + @Override + public Iterator call(Record record) throws Exception { + return record.getIndexedRows().iterator(); + } + + }; + @Override public SparkGridState withOverlayModels(Map newOverlayModels) { return new SparkGridState( - columnModel, grid, + columnModel, newOverlayModels, runner, cachedRowCount, @@ -832,8 +877,8 @@ public SparkGridState withOverlayModels(Map newOverlayMode @Override public GridState withColumnModel(ColumnModel 
newColumnModel) { return new SparkGridState( - newColumnModel, grid, + newColumnModel, overlayModels, runner, cachedRowCount, @@ -841,51 +886,79 @@ public GridState withColumnModel(ColumnModel newColumnModel) { } @Override - public GridState reorderRows(SortingConfig sortingConfig) { + public GridState reorderRows(SortingConfig sortingConfig, boolean permanent) { RowSorter sorter = new RowSorter(this, sortingConfig); // TODO: we should map by the keys generated by the sortingConfig, // and provide a comparator for those: that could be more efficient - JavaPairRDD sortedGrid = RDDUtils.zipWithIndex( - getIndexedRows() - .keyBy(ir -> ir) - .sortByKey(sorter) - .values() - .map(ir -> ir.getRow())); - return new SparkGridState( - columnModel, - sortedGrid, - overlayModels, - runner, - cachedRowCount, - -1); // reordering rows can change the number of records + JavaRDD sortedIndexedRows = getIndexedRows() + .keyBy(ir -> ir) + .sortByKey(sorter) + .values(); + if (permanent) { + JavaPairRDD sortedGrid = RDDUtils.zipWithIndex( + sortedIndexedRows + .map(ir -> ir.getRow())); + return new SparkGridState( + columnModel, + sortedGrid, + overlayModels, + runner, + cachedRowCount, + -1); // reordering rows can change the number of records + } else { + JavaPairRDD zipped = RDDUtils.mapKeyValuesToValues(RDDUtils.zipWithIndex(sortedIndexedRows), + (id, indexedRow) -> new IndexedRow(id, indexedRow.getLogicalIndex(), indexedRow.getRow())); + return new SparkGridState( + zipped, + columnModel, + overlayModels, + runner, + cachedRowCount, + -1); // reordering rows can change the number of records + + } } @Override - public GridState reorderRecords(SortingConfig sortingConfig) { + public GridState reorderRecords(SortingConfig sortingConfig, boolean permanent) { RecordSorter sorter = new RecordSorter(this, sortingConfig); // TODO: we should map by the keys generated by the sortingConfig, // and provide a comparator for those: that could be more efficient - JavaPairRDD sortedGrid = 
RDDUtils.zipWithIndex( - getRecords() - .values() - .keyBy(record -> record) - .sortByKey(sorter) - .values() - .flatMap(recordMap(RecordMapper.IDENTITY))); - return new SparkGridState( - columnModel, - sortedGrid, - overlayModels, - runner, - cachedRowCount, - cachedRecordCount); // reordering records does not change their count + JavaRDD records = getRecords() + .values() + .keyBy(record -> record) + .sortByKey(sorter) + .values(); + if (permanent) { + JavaPairRDD sortedGrid = RDDUtils.zipWithIndex( + records.flatMap(recordMap(RecordMapper.IDENTITY))); + return new SparkGridState( + columnModel, + sortedGrid, + overlayModels, + runner, + cachedRowCount, + cachedRecordCount); // reordering records does not change their count + } else { + JavaPairRDD sortedGrid = RDDUtils.mapKeyValuesToValues(RDDUtils.zipWithIndex( + records.flatMap(recordToIndexedRows)), + (key, indexedRow) -> new IndexedRow(key, indexedRow.getLogicalIndex(), indexedRow.getRow())); + return new SparkGridState( + sortedGrid, + columnModel, + overlayModels, + runner, + cachedRowCount, + cachedRecordCount); // reordering records does not change their count + } } @Override public GridState removeRows(RowFilter filter) { JavaPairRDD newRows = RDDUtils.zipWithIndex(grid .filter(wrapRowFilter(RowFilter.negate(filter))) - .values()); + .values() + .map(IndexedRow::getRow)); return new SparkGridState( columnModel, @@ -912,11 +985,11 @@ public GridState removeRecords(RecordFilter filter) { @Override public GridState limitRows(long rowLimit) { - JavaPairRDD limited = RDDUtils.limit(grid, rowLimit); + JavaPairRDD limited = RDDUtils.limit(grid, rowLimit); long newCachedRowCount = cachedRowCount == -1 ? 
-1 : Math.max(cachedRowCount, rowLimit); return new SparkGridState( - columnModel, limited, + columnModel, overlayModels, runner, newCachedRowCount, @@ -927,6 +1000,7 @@ public GridState limitRows(long rowLimit) { public GridState dropRows(long rowLimit) { JavaPairRDD newRows = grid .filter(wrapRowFilter(RowFilter.limitFilter(rowLimit))) + .mapValues(IndexedRow::getRow) .mapToPair(offsetRowIds(-rowLimit)); // Adapt the partitioner to the new row ids @@ -944,7 +1018,7 @@ public GridState dropRows(long rowLimit) { } } Partitioner newPartitioner = new SortedRDD.SortedPartitioner<>(oldPartitioner.numPartitions(), newIndices); - newRows = new PartitionedRDD(newRows, newPartitioner).asPairRDD(grid.kClassTag(), grid.vClassTag()); + newRows = new PartitionedRDD(newRows, newPartitioner).asPairRDD(grid.kClassTag(), RDDUtils.ROW_TAG); } // Compute the new row count @@ -972,14 +1046,14 @@ public Tuple2 call(Tuple2 t) throws Exception { }; } - private static Function2 rowMap(RowChangeDataProducer mapper) { - return new Function2() { + private static Function2 rowMap(RowChangeDataProducer mapper) { + return new Function2() { private static final long serialVersionUID = 429225090136968798L; @Override - public T call(Long id, Row row) throws Exception { - return mapper.call(id, row); + public T call(Long id, IndexedRow row) throws Exception { + return mapper.call(row.getLogicalIndex(), row.getRow()); } }; } @@ -987,11 +1061,11 @@ public T call(Long id, Row row) throws Exception { @Override public ChangeData mapRows(RowFilter filter, RowChangeDataProducer rowMapper) { JavaPairRDD data; - JavaPairRDD filteredGrid = grid.filter(wrapRowFilter(filter)); + JavaPairRDD filteredGrid = grid.filter(wrapRowFilter(filter)); if (rowMapper.getBatchSize() == 1) { data = RDDUtils.mapKeyValuesToValues(filteredGrid, rowMap(rowMapper)); } else { - JavaRDD> batched = RDDUtils.partitionWiseBatching(filteredGrid.map(t -> new IndexedRow(t._1, t._2)), + JavaRDD> batched = 
RDDUtils.partitionWiseBatching(filteredGrid.values(), rowMapper.getBatchSize()); data = JavaPairRDD.fromJavaRDD(batched.flatMap(batchedRowMap(rowMapper))); } @@ -1076,21 +1150,22 @@ public GridState join(ChangeData changeData, RowChangeDataJoiner rowJo // TODO this left outer join does not rely on the fact that the RDDs are sorted by key // and there could be spurious shuffles if the partitioners differ slightly. // the last sort could be avoided as well (but by default leftOuterJoin will not preserve order…) - JavaPairRDD>> join = grid.leftOuterJoin(sparkChangeData.getData()).sortByKey(); - JavaPairRDD newGrid = RDDUtils.mapKeyValuesToValues(join, wrapJoiner(rowJoiner)); + JavaPairRDD>> join = grid.leftOuterJoin(sparkChangeData.getData()).sortByKey(); + JavaPairRDD newGrid = RDDUtils.mapKeyValuesToValues(join, wrapJoiner(rowJoiner)); - return new SparkGridState(newColumnModel, newGrid, overlayModels, runner); + return new SparkGridState(newGrid, newColumnModel, overlayModels, runner); } - private static Function2>, Row> wrapJoiner(RowChangeDataJoiner joiner) { + private static Function2>, IndexedRow> wrapJoiner(RowChangeDataJoiner joiner) { - return new Function2>, Row>() { + return new Function2>, IndexedRow>() { private static final long serialVersionUID = 3976239801526423806L; @Override - public Row call(Long id, Tuple2> tuple) throws Exception { - return joiner.call(id, tuple._1, tuple._2.or(null)); + public IndexedRow call(Long id, Tuple2> tuple) throws Exception { + return new IndexedRow(id, tuple._1.getOriginalIndex(), + joiner.call(tuple._1.getLogicalIndex(), tuple._1.getRow(), tuple._2.or(null))); } }; } @@ -1105,21 +1180,21 @@ public GridState join(ChangeData changeData, RowChangeDataFlatJoiner r // TODO this left outer join does not rely on the fact that the RDDs are sorted by key // and there could be spurious shuffles if the partitioners differ slightly. 
// the last sort could be avoided as well (but by default leftOuterJoin will not preserve order…) - JavaPairRDD>> join = grid.leftOuterJoin(sparkChangeData.getData()).sortByKey(); + JavaPairRDD>> join = grid.leftOuterJoin(sparkChangeData.getData()).sortByKey(); JavaPairRDD> newGrid = RDDUtils.mapKeyValuesToValues(join, wrapFlatJoiner(rowJoiner)); JavaPairRDD flattened = RDDUtils.zipWithIndex(newGrid.values().flatMap(l -> l.iterator())); return new SparkGridState(newColumnModel, flattened, overlayModels, runner); } - private static Function2>, List> wrapFlatJoiner(RowChangeDataFlatJoiner joiner) { + private static Function2>, List> wrapFlatJoiner(RowChangeDataFlatJoiner joiner) { - return new Function2>, List>() { + return new Function2>, List>() { private static final long serialVersionUID = 3976239801526423806L; @Override - public List call(Long id, Tuple2> tuple) throws Exception { - return joiner.call(id, tuple._1, tuple._2.or(null)); + public List call(Long id, Tuple2> tuple) throws Exception { + return joiner.call(tuple._1.getLogicalIndex(), tuple._1.getRow(), tuple._2.or(null)); } }; } @@ -1163,7 +1238,7 @@ public GridState concatenate(GridState other) { ColumnModel merged = columnModel.merge(other.getColumnModel()); SparkGridState sparkGrid = (SparkGridState) other; - JavaRDD rows = grid.values().union(sparkGrid.getGrid().values()); + JavaRDD rows = grid.values().union(sparkGrid.getGrid().values()).map(IndexedRow::getRow); JavaPairRDD indexedRows = RDDUtils.zipWithIndex(rows); Map mergedOverlayModels = new HashMap<>(other.getOverlayModels()); mergedOverlayModels.putAll(overlayModels); diff --git a/refine-spark-runner/src/main/java/org/openrefine/model/rdd/RecordRDD.java b/refine-spark-runner/src/main/java/org/openrefine/model/rdd/RecordRDD.java index 0c6dde037026..ffbb22186e97 100644 --- a/refine-spark-runner/src/main/java/org/openrefine/model/rdd/RecordRDD.java +++ b/refine-spark-runner/src/main/java/org/openrefine/model/rdd/RecordRDD.java @@ -88,7 +88,7 @@ 
protected UnfinishedRecord(List rows, Long firstRecordStart) { } } - private final ClassTag> classTag; + private final ClassTag> classTag; private final int keyCellIndex; private final UnfinishedRecord[] firstRows; private final Partitioner sortedPartitioner; @@ -101,7 +101,7 @@ protected UnfinishedRecord(List rows, Long firstRecordStart) { * @param keyCellIndex * the column index used to determine record boundaries */ - public RecordRDD(JavaPairRDD prev, int keyCellIndex) { + public RecordRDD(JavaPairRDD prev, int keyCellIndex) { super(prev.rdd(), TUPLE2_TAG); classTag = prev.classTag(); this.keyCellIndex = keyCellIndex; @@ -131,10 +131,10 @@ public Option partitioner() { @Override public Iterator> compute(Partition partition, TaskContext context) { RecordRDDPartition recordPartition = (RecordRDDPartition) partition; - Iterator> parentIter = this.firstParent(classTag).iterator(recordPartition.prev, context); + Iterator> parentIter = this.firstParent(classTag).iterator(recordPartition.prev, context); java.util.Iterator indexedRows = Iterators.transform( JavaConverters.asJavaIterator(parentIter), - tuple -> new IndexedRow(tuple._1, tuple._2)); + Tuple2::_2); java.util.Iterator records = Record.groupIntoRecords( indexedRows, keyCellIndex, @@ -182,7 +182,7 @@ public Partition[] getPartitions() { * */ protected static class ExtractFirstRecord - implements Function2>, UnfinishedRecord>, Serializable { + implements Function2>, UnfinishedRecord>, Serializable { private static final long serialVersionUID = 8473670054764783718L; private final int keyCellIndex; @@ -192,15 +192,15 @@ protected static class ExtractFirstRecord } @Override - public UnfinishedRecord apply(TaskContext v1, Iterator> iterator) { + public UnfinishedRecord apply(TaskContext v1, Iterator> iterator) { List currentRows = new ArrayList<>(); Long firstRecordStart = null; while (firstRecordStart == null && iterator.hasNext()) { - Tuple2 tuple = iterator.next(); - if (Record.isRecordStart(tuple._2, 
keyCellIndex)) { + Tuple2 tuple = iterator.next(); + if (Record.isRecordStart(tuple._2.getRow(), keyCellIndex)) { firstRecordStart = tuple._1; } else { - currentRows.add(tuple._2); + currentRows.add(tuple._2.getRow()); } } return new UnfinishedRecord(currentRows, firstRecordStart); diff --git a/refine-spark-runner/src/main/java/org/openrefine/util/RDDUtils.java b/refine-spark-runner/src/main/java/org/openrefine/util/RDDUtils.java index d49a32306b80..1c7db6b7d77c 100644 --- a/refine-spark-runner/src/main/java/org/openrefine/util/RDDUtils.java +++ b/refine-spark-runner/src/main/java/org/openrefine/util/RDDUtils.java @@ -1,9 +1,7 @@ package org.openrefine.util; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; +import java.util.*; import com.google.common.collect.Iterators; import org.apache.spark.Partitioner; @@ -13,12 +11,11 @@ import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFlatMapFunction; -import org.apache.spark.rdd.OrderedRDDFunctions; import scala.Tuple2; -import scala.math.Ordering; import scala.reflect.ClassManifestFactory; import scala.reflect.ClassTag; +import org.openrefine.model.IndexedRow; import org.openrefine.model.Row; import org.openrefine.model.rdd.PartitionedRDD; import org.openrefine.model.rdd.SortedRDD.SortedPartitioner; @@ -35,8 +32,11 @@ public class RDDUtils { @SuppressWarnings("unchecked") public static final ClassTag> ROW_TUPLE2_TAG = ClassManifestFactory .fromClass((Class>) (Class) Tuple2.class); + public static final ClassTag> INDEXEDROW_TUPLE2_TAG = ClassManifestFactory + .fromClass((Class>) (Class) Tuple2.class); public static final ClassTag LONG_TAG = ClassManifestFactory.fromClass(Long.class); public static final ClassTag ROW_TAG = ClassManifestFactory.fromClass(Row.class); + public static final ClassTag INDEXEDROW_TAG = ClassManifestFactory.fromClass(IndexedRow.class); /** * Returns the first few 
records after a given index from an indexed RDD. If the RDD has a RangePartitioner (any @@ -50,14 +50,39 @@ public class RDDUtils { * the maximum number of records to return * @return the list of records corresponding to the requested page */ - public static List> paginate(JavaPairRDD rdd, long start, int limit) { + public static List> paginateAfter(JavaPairRDD rdd, long start, int limit) { if (start == 0) { return rdd.take(limit); } else { - return filterByRange(rdd, start, Long.MAX_VALUE).take(limit); + return rdd.filterByRange(start, Long.MAX_VALUE).take(limit); } } + /** + * Returns the last few records up to a given upper bound from an indexed RDD. If the RDD has a RangePartitioner + * (any sorted RDD), this will be achieved by only scanning the relevant partitions. + * + * @param rdd + * the RDD to extract the records from. + * @param end + * the minimum index (inclusive) to return + * @param limit + * the maximum number of records to return + * @return the list of records corresponding to the requested page + */ + public static List> paginateBefore(JavaPairRDD rdd, long end, int limit) { + // TODO this could be optimized (see the PLL implementation which is more efficient) + Iterator> iterator = rdd.filterByRange(Long.MIN_VALUE, end - 1).toLocalIterator(); + Deque> buffer = new ArrayDeque<>(limit); + while (iterator.hasNext()) { + if (buffer.size() == limit) { + buffer.removeFirst(); + } + buffer.addLast(iterator.next()); + } + return new ArrayList<>(buffer); + } + /** * This is what JavaRDD.zipWithIndex really ought to do * @@ -69,41 +94,6 @@ public static JavaPairRDD zipWithIndex(JavaRDD rdd) { return new ZippedWithIndexRDD(rdd).asPairRDD(); } - /** - * Efficiently filters a RDD which has a RangePartitioner (any sorted RDD) by pruning partitions which cannot - * contain keys outside the range, or falls back on regular filter if no RangePartitioner is available. - *

- * Workaround for SPARK-31518, which will be fixed - * in 3.1.0 - *

- * TODO remove this once 3.1.0 is released - * - * @param - * type of values - * @param rdd - * the RDD to filter - * @param lower - * the lower bound (inclusive) - * @param upper - * the upper bound (exclusive) - * @return a RDD containing only keys within the range - */ - public static JavaPairRDD filterByRange(JavaPairRDD rdd, long lower, long upper) { - Ordering ordering = new Ordering() { - - private static final long serialVersionUID = 1L; - - @Override - public int compare(Long x, Long y) { - return Long.compare(x, y); - } - }; - return JavaPairRDD.fromRDD(new OrderedRDDFunctions>( - rdd.rdd(), ordering, ClassManifestFactory.fromClass(Long.class), rdd.vClassTag(), rdd.classTag()) - .filterByRange(lower, upper), - rdd.kClassTag(), rdd.vClassTag()); - } - /** * Like JavaPairRDD.mapValues in that it preserves partitioning of the underlying RDD, but the mapping function has * also access to the key. diff --git a/refine-spark-runner/src/test/java/org/openrefine/SparkBasedTest.java b/refine-spark-runner/src/test/java/org/openrefine/SparkBasedTest.java index 316afc53e45d..2c5de3c01afe 100644 --- a/refine-spark-runner/src/test/java/org/openrefine/SparkBasedTest.java +++ b/refine-spark-runner/src/test/java/org/openrefine/SparkBasedTest.java @@ -17,7 +17,9 @@ import org.openrefine.io.OrderedLocalFileSystem; import org.openrefine.model.Cell; +import org.openrefine.model.IndexedRow; import org.openrefine.model.Row; +import org.openrefine.util.RDDUtils; public class SparkBasedTest { @@ -78,6 +80,11 @@ protected JavaPairRDD rowRDD(Serializable[][] cellValues, int numPart return rowRDD(cells, numPartitions); } + protected JavaPairRDD indexedRowRDD(Serializable[][] cellValues, int numPartitions) { + return RDDUtils.mapKeyValuesToValues(rowRDD(cellValues, numPartitions), + (key, value) -> new IndexedRow(key, value)); + } + protected JavaPairRDD rowRDD(Serializable[][] cells) { return rowRDD(cells, 2); } diff --git 
a/refine-spark-runner/src/test/java/org/openrefine/model/SparkGridStateTests.java b/refine-spark-runner/src/test/java/org/openrefine/model/SparkGridStateTests.java index 022aec41cb02..17204afc4536 100644 --- a/refine-spark-runner/src/test/java/org/openrefine/model/SparkGridStateTests.java +++ b/refine-spark-runner/src/test/java/org/openrefine/model/SparkGridStateTests.java @@ -75,12 +75,12 @@ public void testToString() { @Test public void testGetGrid() { - JavaPairRDD grid = state.getGrid(); - Row row1 = grid.lookup(0L).get(0); + JavaPairRDD grid = state.getGrid(); + Row row1 = grid.lookup(0L).get(0).getRow(); Assert.assertEquals(row1.getCellValue(0), 1); Assert.assertEquals(row1.getCellValue(1), 2); Assert.assertEquals(row1.getCellValue(2), "3"); - Row row2 = grid.lookup(1L).get(0); + Row row2 = grid.lookup(1L).get(0).getRow(); Assert.assertEquals(row2.getCellValue(0), 4); Assert.assertEquals(row2.getCellValue(1), "5"); Assert.assertEquals(row2.getCellValue(2), true); @@ -94,7 +94,7 @@ public void testSaveAndLoad() throws IOException { SparkGridState loaded = SparkGridState.loadFromFile(context(), tempFile); Assert.assertEquals(loaded.getOverlayModels(), state.getOverlayModels()); - List> loadedGrid = loaded.getGrid().collect(); + List> loadedGrid = loaded.getGrid().collect(); Assert.assertEquals(loadedGrid, state.getGrid().collect()); } @@ -109,7 +109,7 @@ public void testGetAllRecords() { @Test public void testGetRecords() { - Assert.assertEquals(state.getRecords(1L, 10), + Assert.assertEquals(state.getRecordsAfter(1L, 10), Collections.singletonList(new Record(1L, Collections.singletonList(rows.get(1)._2)))); } diff --git a/refine-spark-runner/src/test/java/org/openrefine/model/rdd/RecordRDDTests.java b/refine-spark-runner/src/test/java/org/openrefine/model/rdd/RecordRDDTests.java index c3ff54d8f5cc..cf3d3880a228 100644 --- a/refine-spark-runner/src/test/java/org/openrefine/model/rdd/RecordRDDTests.java +++ 
b/refine-spark-runner/src/test/java/org/openrefine/model/rdd/RecordRDDTests.java @@ -11,8 +11,8 @@ import org.testng.annotations.Test; import org.openrefine.SparkBasedTest; +import org.openrefine.model.IndexedRow; import org.openrefine.model.Record; -import org.openrefine.model.Row; public class RecordRDDTests extends SparkBasedTest { @@ -23,7 +23,7 @@ public Object[][] getPartitionNumbers() { @Test(dataProvider = "partitionNumbers") public void testSplitRecordsOverPartitions(int numPartitions) { - JavaPairRDD rdd = rowRDD(new Serializable[][] { + JavaPairRDD rdd = indexedRowRDD(new Serializable[][] { { "a", "b" }, { "", "c" }, { null, "d" }, @@ -51,7 +51,7 @@ public void testSplitRecordsOverPartitions(int numPartitions) { @Test(dataProvider = "partitionNumbers") public void testNoRecordKey(int numPartitions) { - JavaPairRDD rdd = rowRDD(new Serializable[][] { + JavaPairRDD rdd = indexedRowRDD(new Serializable[][] { { "", "b" }, { "", "c" }, { null, "d" }, @@ -73,7 +73,7 @@ public void testNoRecordKey(int numPartitions) { @Test(dataProvider = "partitionNumbers") public void testAllBlank(int numPartitions) { - JavaPairRDD rdd = rowRDD(new Serializable[][] { + JavaPairRDD rdd = indexedRowRDD(new Serializable[][] { { "", null }, { "", "" }, { null, null }, @@ -94,7 +94,7 @@ public void testAllBlank(int numPartitions) { @Test(dataProvider = "partitionNumbers") public void testCustomRecordKey(int numPartitions) { - JavaPairRDD rdd = rowRDD(new Serializable[][] { + JavaPairRDD rdd = indexedRowRDD(new Serializable[][] { { "a", "b" }, { "", "c" }, { null, "d" }, @@ -122,7 +122,7 @@ public void testCustomRecordKey(int numPartitions) { @Test public void testPartitioner() { - JavaPairRDD rdd = rowRDD(new Serializable[][] { + JavaPairRDD rdd = indexedRowRDD(new Serializable[][] { { "a", "b" }, { "", "c" }, { null, "d" }, @@ -132,7 +132,7 @@ public void testPartitioner() { { null, "i" }, { "j", "k" }, }, 4); - JavaPairRDD sortedRDD = SortedRDD.assumeSorted(rdd); + JavaPairRDD 
sortedRDD = SortedRDD.assumeSorted(rdd); Partitioner partitioner = sortedRDD.partitioner().get(); // preliminary check that the data is partitioned as we expect for this test diff --git a/refine-spark-runner/src/test/java/org/openrefine/util/RDDUtilsTests.java b/refine-spark-runner/src/test/java/org/openrefine/util/RDDUtilsTests.java index bfd5ff9ca842..baaa73292f28 100644 --- a/refine-spark-runner/src/test/java/org/openrefine/util/RDDUtilsTests.java +++ b/refine-spark-runner/src/test/java/org/openrefine/util/RDDUtilsTests.java @@ -1,7 +1,6 @@ package org.openrefine.util; -import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -20,21 +19,6 @@ public class RDDUtilsTests extends SparkBasedTest { - @Test - public void testFilterByRange() { - JavaPairRDD rdd = rowRDD(new Serializable[][] { - { 1, 2 }, - { 3, 4 }, - { 5, 6 }, - { 7, 8 } - }); - JavaPairRDD filtered = RDDUtils.filterByRange(rdd, 2L, 5L); - List> rows = filtered.collect(); - Assert.assertEquals(rows.size(), 2); - Assert.assertEquals(rows.get(0)._2.getCellValue(0), 5); - Assert.assertEquals(rows.get(1)._2.getCellValue(1), 8); - } - @Test public void testLimitPartitionsPair() { List> list = new ArrayList<>(); @@ -132,4 +116,16 @@ public void testLimitPartitioner() { Assert.assertEquals(limited.keys().collect(), LongStream.range(0L, 6L).boxed().collect(Collectors.toList())); } + + @Test + public void testPaginateBefore() { + List list = LongStream.range(0L, 16L).boxed().collect(Collectors.toList()); + JavaRDD rdd = context().parallelize(list, 4); + JavaPairRDD pairs = RDDUtils.zipWithIndex(rdd); + List> pairsList = pairs.collect(); + + Assert.assertEquals(RDDUtils.paginateBefore(pairs, 7L, 3), pairsList.subList(4, 7)); + Assert.assertEquals(RDDUtils.paginateBefore(pairs, 3L, 8), pairsList.subList(0, 3)); + Assert.assertEquals(RDDUtils.paginateBefore(pairs, 20L, 4), pairsList.subList(12, 16)); + } } diff --git 
a/refine-testing/src/main/java/org/openrefine/model/DatamodelRunnerTestBase.java b/refine-testing/src/main/java/org/openrefine/model/DatamodelRunnerTestBase.java index 3cae21420c2a..e65819883e55 100644 --- a/refine-testing/src/main/java/org/openrefine/model/DatamodelRunnerTestBase.java +++ b/refine-testing/src/main/java/org/openrefine/model/DatamodelRunnerTestBase.java @@ -233,12 +233,36 @@ public void testAccessRows() { ; } + } + + @Test + public void testGetRowsAfter() { Assert.assertEquals( - state.getRows(1L, 2).stream().map(ir -> ir.getRow()).collect(Collectors.toList()), + simpleGrid.getRowsAfter(1L, 2).stream().map(ir -> ir.getRow()).collect(Collectors.toList()), expectedRows.subList(1, 3)); - Assert.assertEquals(state.getRows(5L, 3), Collections.emptyList()); - Assert.assertEquals(state.getRows(myRowFilter, SortingConfig.NO_SORTING, 0L, 2), + Assert.assertEquals(simpleGrid.getRowsAfter(myRowFilter, 0L, 2), + Arrays.asList(new IndexedRow(0L, expectedRows.get(0)), + new IndexedRow(2L, expectedRows.get(2)))); + Assert.assertEquals(simpleGrid.getRowsAfter(myRowFilter, 2L, 2), + Arrays.asList(new IndexedRow(2L, expectedRows.get(2)), + new IndexedRow(3L, expectedRows.get(3)))); + } + + /* + * { "a", "b" }, { "", 1 }, { "c", true }, { null, 123123123123L } + */ + @Test + public void testGetRowsBefore() { + Assert.assertEquals( + simpleGrid.getRowsBefore(3L, 2).stream().map(ir -> ir.getRow()).collect(Collectors.toList()), + expectedRows.subList(1, 3)); + Assert.assertEquals(simpleGrid.getRowsBefore(5L, 1), + Collections.singletonList(new IndexedRow(3L, expectedRows.get(3)))); + + Assert.assertEquals(simpleGrid.getRowsBefore(myRowFilter, 2L, 2), + Arrays.asList(new IndexedRow(0L, expectedRows.get(0)))); + Assert.assertEquals(simpleGrid.getRowsBefore(myRowFilter, 3L, 2), Arrays.asList(new IndexedRow(0L, expectedRows.get(0)), new IndexedRow(2L, expectedRows.get(2)))); } @@ -257,28 +281,11 @@ public void testGetRowsById() { } @Test - public void testAccessSortedRows() 
{ - GridState state = gridToSort; - - Assert.assertEquals( - state.getRows(RowFilter.ANY_ROW, sortingConfig, 0, 2), - Arrays.asList( - new IndexedRow(2L, row(null, 0)), - new IndexedRow(1L, row("a", 1)))); - - Assert.assertEquals( - state.getRows(RowFilter.ANY_ROW, sortingConfig, 2, 2), - Arrays.asList( - new IndexedRow(0L, row("c", 1)), - new IndexedRow(3L, row("a", 5)))); - } - - @Test - public void testAccessSortedRowsOutOfBounds() { - GridState state = gridToSort; + public void testAccessRowsOutOfBounds() { + Assert.assertEquals(simpleGrid.getRowsAfter(5L, 3), Collections.emptyList()); Assert.assertEquals( - state.getRows(RowFilter.ANY_ROW, sortingConfig, 30, 10), + gridToSort.getRowsAfter(RowFilter.ANY_ROW, 30, 10), Collections.emptyList()); } @@ -288,13 +295,13 @@ public void testAccessSortedRowsOutOfBounds() { @Override public boolean filterRecord(Record record) { - return record.getStartRowId() == 2L; + return record.getLogicalStartRowId() == 2L; } }; @Test public void testIterateRowsFilter() { - Iterator indexedRows = simpleGrid.iterateRows(myRowFilter, SortingConfig.NO_SORTING).iterator(); + Iterator indexedRows = simpleGrid.iterateRows(myRowFilter).iterator(); Assert.assertTrue(indexedRows.hasNext()); Assert.assertEquals(indexedRows.next(), new IndexedRow(0L, expectedRows.get(0))); Assert.assertTrue(indexedRows.hasNext()); @@ -302,16 +309,6 @@ public void testIterateRowsFilter() { Assert.assertTrue(indexedRows.hasNext()); } - @Test - public void testIterateRowsSortingConfig() { - Iterator indexedRows = gridToSort.iterateRows(RowFilter.ANY_ROW, sortingConfig).iterator(); - Assert.assertTrue(indexedRows.hasNext()); - Assert.assertEquals(indexedRows.next(), new IndexedRow(2L, row(null, 0))); - Assert.assertTrue(indexedRows.hasNext()); - Assert.assertEquals(indexedRows.next(), new IndexedRow(1L, row("a", 1))); - Assert.assertTrue(indexedRows.hasNext()); - } - @Test public void testCountMatchingRows() { 
Assert.assertEquals(simpleGrid.countMatchingRows(myRowFilter), 3); @@ -356,10 +353,22 @@ public void testAccessRecords() { } catch (IllegalArgumentException e) { ; } + } + + @Test + public void testGetRecordsAfter() { + Assert.assertEquals(simpleGrid.getRecordsAfter(1L, 2), expectedRecords.subList(1, 2)); - Assert.assertEquals(state.getRecords(1L, 2), expectedRecords.subList(1, 2)); + Assert.assertEquals(simpleGrid.getRecordsAfter(myRecordFilter, 0L, 3), + Collections.singletonList(expectedRecords.get(1))); + } - Assert.assertEquals(state.getRecords(myRecordFilter, SortingConfig.NO_SORTING, 0L, 3), + @Test + public void testGetRecordsBefore() { + Assert.assertEquals(simpleGrid.getRecordsBefore(3L, 2), expectedRecords.subList(0, 2)); + Assert.assertEquals(simpleGrid.getRecordsBefore(0L, 2), Collections.emptyList()); + + Assert.assertEquals(simpleGrid.getRecordsBefore(myRecordFilter, 3L, 2), Collections.singletonList(expectedRecords.get(1))); } @@ -381,19 +390,6 @@ public void testRecordGroupingNoRecordStart() { noRecordStart.collectRows().stream().map(IndexedRow::getRow).collect(Collectors.toList())); } - @Test - public void testAccessSortedRecords() { - GridState state = gridToSort; - - Assert.assertEquals( - state.getRecords(RecordFilter.ANY_RECORD, sortingConfig, 0, 3), - Arrays.asList( - new Record(1L, Arrays.asList(row("a", 1), row(null, 0))), - new Record(0L, Arrays.asList(row("c", 1))), - new Record(3L, Arrays.asList(row("a", 5))))); - - } - @Test public void testRecordsRespectKeyColumnIndex() { GridState state = simpleGrid.withColumnModel(simpleGrid.getColumnModel().withKeyColumnIndex(1)); @@ -413,24 +409,12 @@ public void testRecordsRespectKeyColumnIndex() { @Test public void testIterateRecordsFilter() { - Iterator records = simpleGrid.iterateRecords(myRecordFilter, SortingConfig.NO_SORTING).iterator(); + Iterator records = simpleGrid.iterateRecords(myRecordFilter).iterator(); Assert.assertTrue(records.hasNext()); Assert.assertEquals(records.next(), 
expectedRecords.get(1)); Assert.assertFalse(records.hasNext()); } - @Test - public void testIterateRecordsSortingConfig() { - Iterator records = gridToSort.iterateRecords(RecordFilter.ANY_RECORD, sortingConfig).iterator(); - Assert.assertTrue(records.hasNext()); - Assert.assertEquals(records.next(), new Record(1L, Arrays.asList(row("a", 1), row(null, 0)))); - Assert.assertTrue(records.hasNext()); - Assert.assertEquals(records.next(), new Record(0L, Arrays.asList(row("c", 1)))); - Assert.assertTrue(records.hasNext()); - Assert.assertEquals(records.next(), new Record(3L, Arrays.asList(row("a", 5)))); - Assert.assertFalse(records.hasNext()); - } - @Test public void testCountMatchingRecords() { Assert.assertEquals(simpleGrid.countMatchingRecords(myRecordFilter), 1); @@ -758,8 +742,8 @@ public void testMapRecords() { } @Test - public void testReorderRows() { - GridState reordered = gridToSort.reorderRows(sortingConfig); + public void testReorderRowsPermanently() { + GridState reordered = gridToSort.reorderRows(sortingConfig, true); GridState expected = createGrid(new String[] { "foo", "bar" }, new Serializable[][] { @@ -769,12 +753,43 @@ public void testReorderRows() { { "a", 5 } }); - Assert.assertEquals(reordered.collectRows(), expected.collectRows()); + assertGridEquals(reordered, expected); + } + + protected static RowFilter rowFilterGreaterThanTwo = new RowFilter() { + + @Override + public boolean filterRow(long rowIndex, Row row) { + return rowIndex >= 2L; + } + }; + + @Test + public void testReorderRowsTemporarily() { + GridState reordered = gridToSort.reorderRows(sortingConfig, false); + + List expectedRows = Arrays.asList( + new IndexedRow(0L, 2L, new Row(Arrays.asList(null, new Cell(0, null)))), + new IndexedRow(1L, 1L, new Row(Arrays.asList(new Cell("a", null), new Cell(1, null)))), + new IndexedRow(2L, 0L, new Row(Arrays.asList(new Cell("c", null), new Cell(1, null)))), + new IndexedRow(3L, 3L, new Row(Arrays.asList(new Cell("a", null), new Cell(5, 
null))))); + + Assert.assertEquals(reordered.getRow(0L), expectedRows.get(0).getRow()); + Assert.assertEquals(reordered.collectRows(), expectedRows); + Assert.assertEquals(IteratorUtils.toList(reordered.iterateRows(RowFilter.ANY_ROW).iterator()), expectedRows); + + Assert.assertEquals(reordered.getRowsAfter(0L, 2), expectedRows.subList(0, 2)); + // this assertion checks that the row id used for filtering is the original one, not the new one + Assert.assertEquals(reordered.getRowsAfter(rowFilterGreaterThanTwo, 2L, 2), Collections.singletonList(expectedRows.get(3))); + + Assert.assertEquals(reordered.getRowsBefore(3L, 2), expectedRows.subList(1, 3)); + // this assertion checks that the row id used for filtering is the original one, not the new one + Assert.assertEquals(reordered.getRowsBefore(rowFilterGreaterThanTwo, 2L, 1), Collections.singletonList(expectedRows.get(0))); } @Test - public void testReorderRecords() { - GridState reordered = gridToSort.reorderRecords(sortingConfig); + public void testReorderRecordsPermanently() { + GridState reordered = gridToSort.reorderRecords(sortingConfig, true); GridState expected = createGrid(new String[] { "foo", "bar" }, new Serializable[][] { @@ -784,7 +799,31 @@ public void testReorderRecords() { { "a", 5 } }); - Assert.assertEquals(reordered.collectRows(), expected.collectRows()); + assertGridEquals(reordered, expected); + } + + @Test + public void testReorderRecordsTemporarily() { + GridState reordered = gridToSort.reorderRecords(sortingConfig, false); + + List expectedRows = Arrays.asList( + new IndexedRow(0L, 1L, new Row(Arrays.asList(new Cell("a", null), new Cell(1, null)))), + new IndexedRow(1L, 2L, new Row(Arrays.asList(null, new Cell(0, null)))), + new IndexedRow(2L, 0L, new Row(Arrays.asList(new Cell("c", null), new Cell(1, null)))), + new IndexedRow(3L, 3L, new Row(Arrays.asList(new Cell("a", null), new Cell(5, null))))); + + List expectedRecords = Arrays.asList( + new Record(0L, 1L, 
Arrays.asList(expectedRows.get(0).getRow(), expectedRows.get(1).getRow())), + new Record(2L, 0L, Collections.singletonList(expectedRows.get(2).getRow())), + new Record(3L, 3L, Collections.singletonList(expectedRows.get(3).getRow()))); + + Assert.assertEquals(reordered.getRecord(2L), expectedRecords.get(1)); + Assert.assertEquals(reordered.collectRows(), expectedRows); + Assert.assertEquals(reordered.collectRecords(), expectedRecords); + Assert.assertEquals(IteratorUtils.toList(reordered.iterateRecords(RecordFilter.ANY_RECORD).iterator()), expectedRecords); + + Assert.assertEquals(reordered.getRecordsAfter(2L, 2), expectedRecords.subList(1, 3)); + Assert.assertEquals(reordered.getRecordsBefore(2L, 2), expectedRecords.subList(0, 1)); } @Test @@ -796,8 +835,7 @@ public void testRemoveRows() { { "", 1 } }); - Assert.assertEquals(removed.getColumnModel(), expected.getColumnModel()); - Assert.assertEquals(removed.collectRows(), expected.collectRows()); + assertGridEquals(removed, expected); } @Test @@ -1135,8 +1173,7 @@ public void testRecordJoinChangeData() { { null, "third" } }); - Assert.assertEquals(joined.getColumnModel(), expected.getColumnModel()); - Assert.assertEquals(joined.collectRows(), expected.collectRows()); + assertGridEquals(joined, expected); } @Test @@ -1280,4 +1317,17 @@ public void testCachingWithProgress() { Assert.assertTrue(simpleGrid.isCached()); Assert.assertEquals(reporter.getPercentage(), 100); } + + /** + * Because {@link GridState} implementations are not required to use the {@link Object#equals(Object)} method to + * compare the contents of grids, we use this helper to check that two grids have the same contents. 
+ * + * @param actual + * @param expected + */ + public static void assertGridEquals(GridState actual, GridState expected) { + Assert.assertEquals(actual.getColumnModel(), expected.getColumnModel()); + Assert.assertEquals(actual.collectRows(), expected.collectRows()); + Assert.assertEquals(actual.getOverlayModels(), expected.getOverlayModels()); + } } diff --git a/refine-testing/src/main/java/org/openrefine/model/TestingGridState.java b/refine-testing/src/main/java/org/openrefine/model/TestingGridState.java index a1b257e9e1dd..ce046d0f190f 100644 --- a/refine-testing/src/main/java/org/openrefine/model/TestingGridState.java +++ b/refine-testing/src/main/java/org/openrefine/model/TestingGridState.java @@ -19,6 +19,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import com.google.common.collect.Iterators; +import org.apache.commons.collections.IteratorUtils; import org.testng.Assert; import org.openrefine.browsing.facets.RecordAggregator; @@ -47,6 +48,7 @@ public class TestingGridState implements GridState { private ColumnModel columnModel; private Map overlayModels; private List rows; + private List indexedRows; private List records; // the following is just to emulate the behaviour of a real implementation, @@ -55,10 +57,15 @@ public class TestingGridState implements GridState { private boolean isCached = false; public TestingGridState(ColumnModel columnModel, List rows, Map overlayModels) { + this(indexRows(rows), columnModel, overlayModels); + } + + protected TestingGridState(List indexedRows, ColumnModel columnModel, Map overlayModels) { this.columnModel = columnModel; - this.rows = rows; + this.indexedRows = indexedRows; + this.rows = indexedRows.stream().map(IndexedRow::getRow).collect(Collectors.toList()); this.overlayModels = overlayModels; - records = groupRowsIntoRecords(rows, columnModel.getKeyColumnIndex()); + records = groupRowsIntoRecords(indexedRows, columnModel.getKeyColumnIndex()); // Check that all rows have the same number of cells as the 
project has columns int expectedSize = columnModel.getColumns().size(); @@ -67,22 +74,8 @@ public TestingGridState(ColumnModel columnModel, List rows, Map groupRowsIntoRecords(List rows, int keyCellIndex) { - List records = new ArrayList<>(); - List currentRecord = new ArrayList<>(); - int recordStart = 0; - for (int i = 0; i < rows.size(); i++) { - if (Record.isRecordStart(rows.get(i), keyCellIndex) && !currentRecord.isEmpty()) { - records.add(new Record(recordStart, currentRecord)); - recordStart = i; - currentRecord = new ArrayList<>(); - } - currentRecord.add(rows.get(i)); - } - if (!currentRecord.isEmpty()) { - records.add(new Record(recordStart, currentRecord)); - } - return records; + public static List groupRowsIntoRecords(List rows, int keyCellIndex) { + return IteratorUtils.toList(Record.groupIntoRecords(rows.iterator(), keyCellIndex, false, Collections.emptyList())); } @Override @@ -95,39 +88,58 @@ public Row getRow(long id) { return rows.get((int) id); } - private List indexedRows() { + private static List indexRows(List rows) { return IntStream.range(0, rows.size()).mapToObj(i -> new IndexedRow((long) i, rows.get(i))).collect(Collectors.toList()); } @Override - public List getRows(long start, int limit) { - return indexedRows().subList( + public List getRowsAfter(long start, int limit) { + return indexedRows.subList( Math.min((int) start, rows.size()), Math.min((int) start + limit, rows.size())); } @Override - public List getRows(RowFilter filter, SortingConfig sortingConfig, long start, int limit) { + public List getRowsAfter(RowFilter filter, long start, int limit) { // Check that the filter is serializable as it is required by the interface, // even if this implementation does not rely on it. 
RowFilter deserializedFilter = TestingDatamodelRunner.serializeAndDeserialize(filter); - List sortedRows = sortedRows(sortingConfig); - return sortedRows.stream() - .filter(tuple -> deserializedFilter.filterRow(tuple.getIndex(), tuple.getRow())) - .skip(start) + return indexedRows.stream() + .filter(tuple -> deserializedFilter.filterRow(tuple.getLogicalIndex(), tuple.getRow()) && tuple.getIndex() >= start) .limit(limit) .collect(Collectors.toList()); } + @Override + public List getRowsBefore(long end, int limit) { + int actualEnd = Math.min((int) end, rows.size()); + return indexedRows.subList( + Math.max(actualEnd - limit, 0), + actualEnd); + } + + @Override + public List getRowsBefore(RowFilter filter, long end, int limit) { + // Check that the filter is serializable as it is required by the interface, + // even if this implementation does not rely on it. + RowFilter deserializedFilter = TestingDatamodelRunner.serializeAndDeserialize(filter); + + // this is really not efficient but that is not the point of this implementation: it should just be correct + List matchingRows = indexedRows.stream() + .filter(tuple -> deserializedFilter.filterRow(tuple.getLogicalIndex(), tuple.getRow()) && tuple.getIndex() < end) + .collect(Collectors.toList()); + return matchingRows.subList(Math.max(0, matchingRows.size() - limit), matchingRows.size()); + } + @Override public List collectRows() { - return indexedRows(); + return indexedRows; } @Override public Record getRecord(long id) { - List matching = getRecords(id, 1); + List matching = getRecordsAfter(id, 1); if (matching.isEmpty() || matching.get(0).getStartRowId() != id) { throw new IllegalArgumentException(String.format("No record with id %d", id)); } @@ -135,7 +147,7 @@ public Record getRecord(long id) { } @Override - public List getRecords(long start, int limit) { + public List getRecordsAfter(long start, int limit) { return records .stream() .filter(record -> record.getStartRowId() >= start) @@ -144,18 +156,34 @@ public 
List getRecords(long start, int limit) { } @Override - public List getRecords(RecordFilter filter, SortingConfig sortingConfig, long start, int limit) { + public List getRecordsAfter(RecordFilter filter, long start, int limit) { // Check that the filter is serializable as it is required by the interface, // even if this implementation does not rely on it. RecordFilter deserializedFilter = TestingDatamodelRunner.serializeAndDeserialize(filter); - List sorted = sortedRecords(sortingConfig); - return sorted + return records .stream() .filter(record -> record.getStartRowId() >= start && deserializedFilter.filterRecord(record)) .limit(limit) .collect(Collectors.toList()); } + @Override + public List getRecordsBefore(long end, int limit) { + return getRecordsBefore(RecordFilter.ANY_RECORD, end, limit); + } + + @Override + public List getRecordsBefore(RecordFilter filter, long end, int limit) { + // Check that the filter is serializable as it is required by the interface, + // even if this implementation does not rely on it. 
+ RecordFilter deserializedFilter = TestingDatamodelRunner.serializeAndDeserialize(filter); + List matching = records + .stream() + .filter(record -> record.getStartRowId() < end && deserializedFilter.filterRecord(record)) + .collect(Collectors.toList()); + return matching.subList(Math.max(0, matching.size() - limit), matching.size()); + } + @Override public List collectRecords() { return records; @@ -173,7 +201,7 @@ public long recordCount() { @Override public long countMatchingRows(RowFilter filter) { - return indexedRows() + return indexedRows .stream() .filter(tuple -> filter.filterRow(tuple.getIndex(), tuple.getRow())) .count(); @@ -181,7 +209,7 @@ public long countMatchingRows(RowFilter filter) { @Override public ApproxCount countMatchingRowsApprox(RowFilter filter, long limit) { - long matching = indexedRows() + long matching = indexedRows .stream() .limit(limit) .filter(tuple -> filter.filterRow(tuple.getIndex(), tuple.getRow())) @@ -233,7 +261,7 @@ protected void saveToFile(File file, Optional progressReporter try { fos = new FileOutputStream(partFile); gos = new GZIPOutputStream(fos); - for (IndexedRow row : indexedRows()) { + for (IndexedRow row : indexedRows) { ParsingUtilities.saveWriter.writeValue(gos, row); gos.write('\n'); } @@ -276,7 +304,7 @@ public PartialAggregation aggregateRowsApprox(RowAgg T statesA = initialState; T statesB = initialState; long count = 0; - for (IndexedRow row : indexedRows()) { + for (IndexedRow row : indexedRows) { if (count == maxRows) { break; } @@ -324,7 +352,7 @@ public GridState mapRows(RowMapper mapper, ColumnModel newColumnModel) { mapper = TestingDatamodelRunner.serializeAndDeserialize(mapper); List rows = new ArrayList<>(this.rows.size()); - for (IndexedRow indexedRow : indexedRows()) { + for (IndexedRow indexedRow : indexedRows) { Row row = mapper.call(indexedRow.getIndex(), indexedRow.getRow()); if (row.getCells().size() != newColumnModel.getColumns().size()) { Assert.fail(String.format("Row size (%d) 
inconsistent with supplied column model (%s)", @@ -342,7 +370,7 @@ public GridState flatMapRows(RowFlatMapper mapper, ColumnModel newColumnModel) { mapper = TestingDatamodelRunner.serializeAndDeserialize(mapper); List rows = new ArrayList<>(this.rows.size()); - for (IndexedRow indexedRow : indexedRows()) { + for (IndexedRow indexedRow : indexedRows) { List rowBatch = mapper.call(indexedRow.getIndex(), indexedRow.getRow()); for (Row row : rowBatch) { if (row.getCells().size() != newColumnModel.getColumns().size()) { @@ -363,7 +391,7 @@ public GridState mapRows(RowScanMapper mapper, Colum S currentState = mapper.unit(); List rows = new ArrayList<>(this.rows.size()); - for (IndexedRow indexedRow : indexedRows()) { + for (IndexedRow indexedRow : indexedRows) { Row row = mapper.map(currentState, indexedRow.getIndex(), indexedRow.getRow()); currentState = mapper.combine(currentState, mapper.feed(indexedRow.getIndex(), indexedRow.getRow())); if (row.getCells().size() != newColumnModel.getColumns().size()) { @@ -395,15 +423,14 @@ public GridState mapRecords(RecordMapper mapper, ColumnModel newColumnModel) { } @Override - public Iterable iterateRows(RowFilter filter, SortingConfig sortingConfig) { - List sortedRows = sortedRows(sortingConfig); + public Iterable iterateRows(RowFilter filter) { return new Iterable() { @Override public Iterator iterator() { - return sortedRows + return indexedRows .stream() - .filter(ir -> filter.filterRow(ir.getIndex(), ir.getRow())) + .filter(ir -> filter.filterRow(ir.getLogicalIndex(), ir.getRow())) .iterator(); } @@ -411,13 +438,12 @@ public Iterator iterator() { } @Override - public Iterable iterateRecords(RecordFilter filter, SortingConfig sortingConfig) { - List sorted = sortedRecords(sortingConfig); + public Iterable iterateRecords(RecordFilter filter) { return new Iterable() { @Override public Iterator iterator() { - return sorted + return records .stream() .filter(r -> filter.filterRecord(r)) .iterator(); @@ -437,31 +463,39 @@ public 
GridState withColumnModel(ColumnModel newColumnModel) { } @Override - public GridState reorderRows(SortingConfig sortingConfig) { - return new TestingGridState(columnModel, - sortedRows(sortingConfig).stream().map(r -> r.getRow()).collect(Collectors.toList()), - overlayModels); + public GridState reorderRows(SortingConfig sortingConfig, boolean permanent) { + List newRows = sortedRows(sortingConfig); + if (permanent) { + return new TestingGridState(columnModel, newRows.stream().map(IndexedRow::getRow).collect(Collectors.toList()), overlayModels); + } else { + List indexed = IntStream.range(0, newRows.size()) + .mapToObj(i -> new IndexedRow((long) i, newRows.get(i).getLogicalIndex(), newRows.get(i).getRow())) + .collect(Collectors.toList()); + return new TestingGridState(indexed, columnModel, overlayModels); + } } @Override - public GridState reorderRecords(SortingConfig sortingConfig) { - List newRows = new ArrayList<>(rows.size()); + public GridState reorderRecords(SortingConfig sortingConfig, boolean permanent) { + List newRows = new ArrayList<>(rows.size()); if (sortingConfig.getCriteria().isEmpty()) { - newRows = rows; + newRows = indexedRows; } else { for (Record record : sortedRecords(sortingConfig)) { - newRows.addAll(record.getRows()); + for (IndexedRow row : record.getIndexedRows()) { + newRows.add(new IndexedRow(newRows.size(), permanent ? 
null : row.getLogicalIndex(), row.getRow())); + } } } - return new TestingGridState(columnModel, newRows, overlayModels); + return new TestingGridState(newRows, columnModel, overlayModels); } private List sortedRows(SortingConfig sortingConfig) { if (sortingConfig.equals(SortingConfig.NO_SORTING)) { - return indexedRows(); + return indexedRows; } RowSorter rowSorter = new RowSorter(this, sortingConfig); - List sortedIndexedRows = new ArrayList<>(indexedRows()); + List sortedIndexedRows = new ArrayList<>(indexedRows); Collections.sort(sortedIndexedRows, rowSorter); return sortedIndexedRows; } @@ -484,9 +518,9 @@ public DatamodelRunner getDatamodelRunner() { @Override public GridState removeRows(RowFilter filter) { - List newRows = indexedRows() + List newRows = indexedRows .stream() - .filter(ir -> !filter.filterRow(ir.getIndex(), ir.getRow())) + .filter(ir -> !filter.filterRow(ir.getLogicalIndex(), ir.getRow())) .map(ir -> ir.getRow()) .collect(Collectors.toList()); return new TestingGridState(columnModel, newRows, overlayModels); @@ -510,7 +544,7 @@ public ChangeData mapRows(RowFilter filter, RowChangeDataProducer rowM RowFilter deserializedFilter = TestingDatamodelRunner.serializeAndDeserialize(filter); Map changeData = new HashMap<>(); - Stream filteredRows = indexedRows().stream() + Stream filteredRows = indexedRows.stream() .filter(ir -> deserializedFilter.filterRow(ir.getIndex(), ir.getRow())); if (deserializedMapper.getBatchSize() == 1) { filteredRows.forEach(ir -> changeData.put(ir.getIndex(), deserializedMapper.call(ir.getIndex(), ir.getRow()))); @@ -567,7 +601,7 @@ public GridState join(ChangeData changeData, RowChangeDataJoiner rowJo // even if this implementation does not rely on it. 
RowChangeDataJoiner deserializedJoiner = TestingDatamodelRunner.serializeAndDeserialize(rowJoiner); - List newRows = indexedRows() + List newRows = indexedRows .stream() .map(ir -> deserializedJoiner.call(ir.getIndex(), ir.getRow(), changeData.get(ir.getIndex()))) .collect(Collectors.toList()); @@ -581,7 +615,7 @@ public GridState join(ChangeData changeData, RowChangeDataFlatJoiner r // even if this implementation does not rely on it. RowChangeDataFlatJoiner deserializedJoiner = TestingDatamodelRunner.serializeAndDeserialize(rowJoiner); - List newRows = indexedRows() + List newRows = indexedRows .stream() .flatMap(ir -> deserializedJoiner.call(ir.getIndex(), ir.getRow(), changeData.get(ir.getIndex())).stream()) .collect(Collectors.toList()); diff --git a/refine-workflow/src/main/java/org/openrefine/exporters/CustomizableTabularExporterUtilities.java b/refine-workflow/src/main/java/org/openrefine/exporters/CustomizableTabularExporterUtilities.java index e61ddcfd6467..2c6c9c971e8b 100644 --- a/refine-workflow/src/main/java/org/openrefine/exporters/CustomizableTabularExporterUtilities.java +++ b/refine-workflow/src/main/java/org/openrefine/exporters/CustomizableTabularExporterUtilities.java @@ -140,7 +140,7 @@ static public void exportRows( serializer.addRow(cells, true); } - for (IndexedRow indexedRow : grid.iterateRows(engine.combinedRowFilters(), sortingConfig)) { + for (IndexedRow indexedRow : grid.reorderRows(sortingConfig, false).iterateRows(engine.combinedRowFilters())) { List cells = new ArrayList(columnNames.size()); int nonNullCount = 0; diff --git a/refine-workflow/src/main/java/org/openrefine/importers/LineBasedImporter.java b/refine-workflow/src/main/java/org/openrefine/importers/LineBasedImporter.java index 2707f08d5b79..1540175c5887 100644 --- a/refine-workflow/src/main/java/org/openrefine/importers/LineBasedImporter.java +++ b/refine-workflow/src/main/java/org/openrefine/importers/LineBasedImporter.java @@ -18,7 +18,6 @@ import 
org.openrefine.model.Row; import org.openrefine.model.RowFilter; import org.openrefine.model.RowMapper; -import org.openrefine.sorting.SortingConfig; import org.openrefine.util.JSONUtilities; /** @@ -54,7 +53,7 @@ protected GridState postTransform(GridState parsed, ObjectNode options) { // so we resort to loading everything in memory List newRows = new ArrayList<>(); List currentCells = new ArrayList<>(); - for (IndexedRow row : parsed.iterateRows(RowFilter.ANY_ROW, SortingConfig.NO_SORTING)) { + for (IndexedRow row : parsed.iterateRows(RowFilter.ANY_ROW)) { currentCells.add(row.getRow().getCell(0)); if (currentCells.size() >= linesPerRow) { newRows.add(new Row(currentCells)); diff --git a/refine-workflow/src/main/java/org/openrefine/importers/LineBasedImporterBase.java b/refine-workflow/src/main/java/org/openrefine/importers/LineBasedImporterBase.java index 134e9fb6e265..c56a93cdbf80 100644 --- a/refine-workflow/src/main/java/org/openrefine/importers/LineBasedImporterBase.java +++ b/refine-workflow/src/main/java/org/openrefine/importers/LineBasedImporterBase.java @@ -166,7 +166,7 @@ public GridState parseOneFile(DatamodelRunner runner, ProjectMetadata metadata, } headerLines = 0; } else if (headerLines > 0) { - List firstLines = rawCells.getRows(ignoreLines, headerLines); + List firstLines = rawCells.getRowsAfter(ignoreLines, headerLines); for (int i = 0; i < firstLines.size(); i++) { IndexedRow headerLine = firstLines.get(i); Row mappedRow = rowMapper.call(headerLine.getIndex(), headerLine.getRow()); diff --git a/refine-workflow/src/main/java/org/openrefine/importers/SeparatorBasedImporter.java b/refine-workflow/src/main/java/org/openrefine/importers/SeparatorBasedImporter.java index 7379751704ee..c5420ec7ad49 100644 --- a/refine-workflow/src/main/java/org/openrefine/importers/SeparatorBasedImporter.java +++ b/refine-workflow/src/main/java/org/openrefine/importers/SeparatorBasedImporter.java @@ -69,7 +69,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
(INCLUDING, BUT NOT import org.openrefine.model.Row; import org.openrefine.model.RowFilter; import org.openrefine.model.RowMapper; -import org.openrefine.sorting.SortingConfig; import org.openrefine.util.JSONUtilities; public class SeparatorBasedImporter extends LineBasedImporterBase { @@ -186,7 +185,7 @@ public TableDataReader createTableDataReader( final CSVParser parser = getCSVParser(options); - Iterator lines = grid.iterateRows(RowFilter.ANY_ROW, SortingConfig.NO_SORTING).iterator(); + Iterator lines = grid.iterateRows(RowFilter.ANY_ROW).iterator(); TableDataReader dataReader = new TableDataReader() { diff --git a/refine-workflow/src/main/java/org/openrefine/operations/cell/KeyValueColumnizeOperation.java b/refine-workflow/src/main/java/org/openrefine/operations/cell/KeyValueColumnizeOperation.java index b33a1c9ddfda..5562cf823fd0 100644 --- a/refine-workflow/src/main/java/org/openrefine/operations/cell/KeyValueColumnizeOperation.java +++ b/refine-workflow/src/main/java/org/openrefine/operations/cell/KeyValueColumnizeOperation.java @@ -53,7 +53,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT import org.openrefine.model.changes.Change; import org.openrefine.model.changes.ChangeContext; import org.openrefine.operations.Operation; -import org.openrefine.sorting.SortingConfig; /** * Reshapes a table which contains key and value columns, such that the repeating contents in the key column become new @@ -146,7 +145,7 @@ public GridState apply(GridState projectState, ChangeContext context) throws Doe currentNotes.add(notes); } - for (IndexedRow indexedRow : projectState.iterateRows(RowFilter.ANY_ROW, SortingConfig.NO_SORTING)) { + for (IndexedRow indexedRow : projectState.iterateRows(RowFilter.ANY_ROW)) { Row oldRow = indexedRow.getRow(); Object key = oldRow.getCellValue(keyColumnIndex); diff --git a/refine-workflow/src/main/java/org/openrefine/operations/cell/TransposeColumnsIntoRowsOperation.java 
b/refine-workflow/src/main/java/org/openrefine/operations/cell/TransposeColumnsIntoRowsOperation.java index 3d24e3979ea4..2f4b67473d0e 100644 --- a/refine-workflow/src/main/java/org/openrefine/operations/cell/TransposeColumnsIntoRowsOperation.java +++ b/refine-workflow/src/main/java/org/openrefine/operations/cell/TransposeColumnsIntoRowsOperation.java @@ -55,7 +55,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT import org.openrefine.model.changes.Change; import org.openrefine.model.changes.ChangeContext; import org.openrefine.operations.Operation; -import org.openrefine.sorting.SortingConfig; public class TransposeColumnsIntoRowsOperation implements Operation { @@ -272,7 +271,7 @@ public GridState apply(GridState projectState, ChangeContext context) throws Doe } List newRows = new ArrayList<>(); - for (IndexedRow indexedRow : projectState.iterateRows(RowFilter.ANY_ROW, SortingConfig.NO_SORTING)) { + for (IndexedRow indexedRow : projectState.iterateRows(RowFilter.ANY_ROW)) { Row oldRow = indexedRow.getRow(); RowBuilder firstNewRow = RowBuilder.create(newColumns.size()); int firstNewRowIndex = newRows.size(); diff --git a/refine-workflow/src/main/java/org/openrefine/operations/cell/TransposeRowsIntoColumnsOperation.java b/refine-workflow/src/main/java/org/openrefine/operations/cell/TransposeRowsIntoColumnsOperation.java index c8bf3d19eaaf..102988b0f76f 100644 --- a/refine-workflow/src/main/java/org/openrefine/operations/cell/TransposeRowsIntoColumnsOperation.java +++ b/refine-workflow/src/main/java/org/openrefine/operations/cell/TransposeRowsIntoColumnsOperation.java @@ -52,7 +52,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT import org.openrefine.model.changes.Change; import org.openrefine.model.changes.ChangeContext; import org.openrefine.operations.Operation; -import org.openrefine.sorting.SortingConfig; public class TransposeRowsIntoColumnsOperation implements Operation { @@ -119,7 +118,7 @@ public GridState 
apply(GridState projectState, ChangeContext context) throws Doe int nbNewColumns = newColumns.getColumns().size(); RowBuilder firstNewRow = null; List newRows = new ArrayList<>(); - for (IndexedRow indexedRow : projectState.iterateRows(RowFilter.ANY_ROW, SortingConfig.NO_SORTING)) { + for (IndexedRow indexedRow : projectState.iterateRows(RowFilter.ANY_ROW)) { long r = indexedRow.getIndex(); int r2 = (int) (r % (long) _rowCount); diff --git a/refine-workflow/src/main/java/org/openrefine/operations/row/RowReorderOperation.java b/refine-workflow/src/main/java/org/openrefine/operations/row/RowReorderOperation.java index c5b0c6c703b6..7782eef80b6f 100644 --- a/refine-workflow/src/main/java/org/openrefine/operations/row/RowReorderOperation.java +++ b/refine-workflow/src/main/java/org/openrefine/operations/row/RowReorderOperation.java @@ -92,9 +92,9 @@ public boolean isImmediate() { @Override public GridState apply(GridState projectState, ChangeContext context) throws DoesNotApplyException { if (Mode.RowBased.equals(_mode)) { - return projectState.reorderRows(_sorting); + return projectState.reorderRows(_sorting, true); } else { - return projectState.reorderRecords(_sorting); + return projectState.reorderRecords(_sorting, true); } }