From dbffc78703b2e641d05426a104b65277352d03c9 Mon Sep 17 00:00:00 2001 From: Joey Salazar Date: Thu, 20 Oct 2022 05:24:32 -0600 Subject: [PATCH 1/2] Reconcile filenames in project creation Fixes #56 --- .../commons/importer/FileRecordToRows.java | 46 +++++++++++++++++-- .../importer/FileRecordToRowsTest.java | 4 +- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/openrefine/extensions/commons/importer/FileRecordToRows.java b/src/main/java/org/openrefine/extensions/commons/importer/FileRecordToRows.java index 9be2e03..da11bc7 100644 --- a/src/main/java/org/openrefine/extensions/commons/importer/FileRecordToRows.java +++ b/src/main/java/org/openrefine/extensions/commons/importer/FileRecordToRows.java @@ -2,17 +2,31 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.Iterator; import java.util.List; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.google.refine.expr.ExpressionUtils; import com.google.refine.importers.TabularImportingParserBase.TableDataReader; +import com.google.refine.model.Cell; +import com.google.refine.model.Recon; +import com.google.refine.model.ReconCandidate; +import com.google.refine.model.Recon.Judgment; +import com.google.refine.model.recon.StandardReconConfig; -/* +/** * This class takes an Iterator and converts each FileRecord to one or more rows * * @param iteratorFileRecords */ public class FileRecordToRows implements TableDataReader { + + protected StandardReconConfig reconConfig; + + protected String identifierSpace; + protected String schemaSpace; + protected String service; final Iterator iteratorFileRecords; FileRecord fileRecord; final boolean categoriesColumn; @@ -24,10 +38,14 @@ public FileRecordToRows(Iterator iteratorFileRecords, boolean catego this.iteratorFileRecords = iteratorFileRecords; this.categoriesColumn = categoriesColumn; this.mIdsColumn = mIdsColumn; + this.identifierSpace = "https://commons.wikimedia.org/entity/"; + this.schemaSpace = "http://www.wikidata.org/prop/direct/"; + this.service = "https://commonsreconcile.toolforge.org/en/api"; + this.reconConfig = new StandardReconConfig(service, identifierSpace, schemaSpace, null, null, true, Collections.emptyList()); } - /* + /** * This method iterates over the parameters of a file record spreading them in rows * * @return a row containing a cell per file record parameter @@ -49,7 +67,29 @@ public List getNextRowOfCells() throws IOException { } else if (iteratorFileRecords.hasNext()) { fileRecord = iteratorFileRecords.next(); relatedCategoriesIndex = 0; - rowsOfCells.add(fileRecord.fileName); + if (fileRecord.fileName != null && ExpressionUtils.isNonBlankData(fileRecord.fileName)) { + String id = "M" + fileRecord.pageId; + if(id.startsWith(identifierSpace)) { + id = id.substring(identifierSpace.length()); + } + + ReconCandidate match = new ReconCandidate(id, fileRecord.fileName, new String[0], 100); + Recon newRecon = reconConfig.createNewRecon(0); + newRecon.match = match; + newRecon.candidates = Collections.singletonList(match); + newRecon.matchRank = -1; + newRecon.judgment = Judgment.Matched; + newRecon.judgmentAction = "mass"; + newRecon.judgmentBatchSize = 1; + + Cell newCell = new Cell( + fileRecord.fileName, + newRecon + ); + + rowsOfCells.add(newCell); + } + if (mIdsColumn) { rowsOfCells.add("M" + fileRecord.pageId); } diff --git a/src/test/java/org/openrefine/extensions/commons/importer/FileRecordToRowsTest.java b/src/test/java/org/openrefine/extensions/commons/importer/FileRecordToRowsTest.java index 268fe35..b96b787 100644 --- a/src/test/java/org/openrefine/extensions/commons/importer/FileRecordToRowsTest.java +++ b/src/test/java/org/openrefine/extensions/commons/importer/FileRecordToRowsTest.java @@ -28,10 +28,10 @@ public void testGetNextRowOfCells() throws Exception { rows.add(frtr.getNextRowOfCells()); rows.add(frtr.getNextRowOfCells()); - Assert.assertEquals(rows.get(0), Arrays.asList("File:LasTres.jpg", "M127722", "Category:Costa Rica")); + Assert.assertEquals(rows.get(0).toString(), Arrays.asList("File:LasTres.jpg, M127722, Category:Costa Rica").toString()); Assert.assertEquals(rows.get(1), Arrays.asList(null, null, "Category:Cute dogs")); Assert.assertEquals(rows.get(2), Arrays.asList(null, null, "Category:Costa Rican dogs")); - Assert.assertEquals(rows.get(3), Arrays.asList("File:Playa Gandoca.jpg", "M112933", null)); + Assert.assertEquals(rows.get(3).toString(), Arrays.asList("File:Playa Gandoca.jpg", "M112933", null).toString()); Assert.assertEquals(rows.get(4), null); } From 52a1475b9009ec4e21c6d7315150ebf9e9de2aba Mon Sep 17 00:00:00 2001 From: Joey Salazar Date: Thu, 20 Oct 2022 10:43:20 -0600 Subject: [PATCH 2/2] Add reconciliation statistics --- .../commons/importer/CommonsImporter.java | 19 +++++++++++++++++++ .../importer/FileRecordToRowsTest.java | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/openrefine/extensions/commons/importer/CommonsImporter.java b/src/main/java/org/openrefine/extensions/commons/importer/CommonsImporter.java index acc9e63..745b12e 100644 --- a/src/main/java/org/openrefine/extensions/commons/importer/CommonsImporter.java +++ b/src/main/java/org/openrefine/extensions/commons/importer/CommonsImporter.java @@ -12,7 +12,11 @@ import com.google.refine.ProjectMetadata; import com.google.refine.importers.TabularImportingParserBase; import com.google.refine.importing.ImportingJob; +import com.google.refine.model.Column; import com.google.refine.model.Project; +import com.google.refine.model.ReconStats; +import com.google.refine.model.recon.StandardReconConfig; +import com.google.refine.model.recon.StandardReconConfig.ColumnDetail; import com.google.refine.util.JSONUtilities; public class CommonsImporter { @@ -57,6 +61,7 @@ static public void parse( category.get("depth").asInt())); } String apiUrl = "https://commons.wikimedia.org/w/api.php";//FIXME + String service = "https://commonsreconcile.toolforge.org/en/api"; // initializes progress reporting with the name of the first category setProgress(job, categoriesWithDepth.get(0).categoryName, 0); @@ -79,6 +84,20 @@ static public void parse( options, exceptions ); + + Column col = project.columnModel.columns.get(0); + StandardReconConfig cfg = new StandardReconConfig( + service, + "https://commons.wikimedia.org/entity/", + "http://www.wikidata.org/prop/direct/", + "", + "entity", + true, + new ArrayList(), + 1); + col.setReconStats(ReconStats.create(project, 0)); + col.setReconConfig(cfg); + setProgress(job, categoriesWithDepth.get(categoriesWithDepth.size()-1).categoryName, 100); } diff --git a/src/test/java/org/openrefine/extensions/commons/importer/FileRecordToRowsTest.java b/src/test/java/org/openrefine/extensions/commons/importer/FileRecordToRowsTest.java index b96b787..adaa19f 100644 --- a/src/test/java/org/openrefine/extensions/commons/importer/FileRecordToRowsTest.java +++ b/src/test/java/org/openrefine/extensions/commons/importer/FileRecordToRowsTest.java @@ -28,7 +28,7 @@ public void testGetNextRowOfCells() throws Exception { rows.add(frtr.getNextRowOfCells()); rows.add(frtr.getNextRowOfCells()); - Assert.assertEquals(rows.get(0).toString(), Arrays.asList("File:LasTres.jpg, M127722, Category:Costa Rica").toString()); + Assert.assertEquals(rows.get(0).toString(), Arrays.asList("File:LasTres.jpg", "M127722", "Category:Costa Rica").toString()); Assert.assertEquals(rows.get(1), Arrays.asList(null, null, "Category:Cute dogs")); Assert.assertEquals(rows.get(2), Arrays.asList(null, null, "Category:Costa Rican dogs")); Assert.assertEquals(rows.get(3).toString(), Arrays.asList("File:Playa Gandoca.jpg", "M112933", null).toString());