-
Notifications
You must be signed in to change notification settings - Fork 100
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #41 from Wikidata/dump-processing-example
Complete dump processing pipeline + example
- Loading branch information
Showing
10 changed files
with
485 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
40 changes: 40 additions & 0 deletions
40
wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/EntityDocumentProcessor.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
package org.wikidata.wdtk.dumpfiles; | ||
|
||
import org.wikidata.wdtk.datamodel.interfaces.EntityDocument; | ||
import org.wikidata.wdtk.datamodel.interfaces.ItemDocument; | ||
import org.wikidata.wdtk.datamodel.interfaces.PropertyDocument; | ||
|
||
/** | ||
* Interface for classes that are able to process {@link EntityDocument} objects | ||
* in some way. Classes that implement this can subscribe to receive entity | ||
* documents as obtained, e.g., from parsing dump files. | ||
* | ||
* @author Markus Kroetzsch | ||
* | ||
*/ | ||
public interface EntityDocumentProcessor { | ||
|
||
/** | ||
* Processes the given ItemDocument. | ||
* | ||
* @param itemDocument | ||
* the ItemDocument | ||
*/ | ||
public void processItemDocument(ItemDocument itemDocument); | ||
|
||
/** | ||
* Processes the given PropertyDocument. | ||
* | ||
* @param propertyDocument | ||
* the PropertyDocument | ||
*/ | ||
public void processPropertyDocument(PropertyDocument propertyDocument); | ||
|
||
/** | ||
* Performs final actions that should be done after all entity documents in | ||
* a batch of entity documents have been processed. This is usually called | ||
* after a whole dumpfile was completely processed. | ||
*/ | ||
public void finishProcessingEntityDocuments(); | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
88 changes: 88 additions & 0 deletions
88
wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WikibaseRevisionProcessor.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
package org.wikidata.wdtk.dumpfiles; | ||
|
||
import java.util.Map; | ||
|
||
import org.json.JSONException; | ||
import org.json.JSONObject; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
import org.wikidata.wdtk.datamodel.implementation.DataObjectFactoryImpl; | ||
import org.wikidata.wdtk.datamodel.interfaces.DataObjectFactory; | ||
import org.wikidata.wdtk.datamodel.interfaces.ItemDocument; | ||
import org.wikidata.wdtk.datamodel.interfaces.PropertyDocument; | ||
|
||
/** | ||
* A revision processor that processes Wikibase entity content from a dump file. | ||
* Revisions are parsed to obtain EntityDocument objects. | ||
* | ||
* @author Markus Kroetzsch | ||
* | ||
*/ | ||
public class WikibaseRevisionProcessor implements MwRevisionProcessor { | ||
|
||
static final Logger logger = LoggerFactory | ||
.getLogger(WikibaseRevisionProcessor.class); | ||
|
||
JsonConverter jsonConverter; | ||
final DataObjectFactory dataObjectFactory; | ||
final EntityDocumentProcessor entityDocumentProcessor; | ||
|
||
public WikibaseRevisionProcessor( | ||
EntityDocumentProcessor entityDocumentProcessor) { | ||
this.dataObjectFactory = new DataObjectFactoryImpl(); | ||
this.entityDocumentProcessor = entityDocumentProcessor; | ||
} | ||
|
||
@Override | ||
public void startRevisionProcessing(String siteName, String baseUrl, | ||
Map<Integer, String> namespaces) { | ||
// FIXME the baseUrl from the dump is not the baseIri we need here | ||
this.jsonConverter = new JsonConverter(baseUrl, this.dataObjectFactory); | ||
} | ||
|
||
@Override | ||
public void processRevision(MwRevision mwRevision) { | ||
if (MwRevision.MODEL_WIKIBASE_ITEM.equals(mwRevision.getModel())) { | ||
processItemRevision(mwRevision); | ||
} else if (MwRevision.MODEL_WIKIBASE_PROPERTY.equals(mwRevision | ||
.getModel())) { | ||
processPropertyRevision(mwRevision); | ||
} // else: ignore this revision | ||
} | ||
|
||
public void processItemRevision(MwRevision mwRevision) { | ||
try { | ||
JSONObject jsonObject = new JSONObject(mwRevision.getText()); | ||
ItemDocument itemDocument = this.jsonConverter | ||
.convertToItemDocument(jsonObject, mwRevision.getTitle()); | ||
this.entityDocumentProcessor.processItemDocument(itemDocument); | ||
} catch (JSONException e) { | ||
WikibaseRevisionProcessor.logger | ||
.error("Failed to process JSON for item " | ||
+ mwRevision.toString() + " (" + e.toString() + ")"); | ||
} | ||
|
||
} | ||
|
||
public void processPropertyRevision(MwRevision mwRevision) { | ||
try { | ||
JSONObject jsonObject = new JSONObject(mwRevision.getText()); | ||
PropertyDocument propertyDocument = this.jsonConverter | ||
.convertToPropertyDocument(jsonObject, | ||
mwRevision.getTitle()); | ||
this.entityDocumentProcessor | ||
.processPropertyDocument(propertyDocument); | ||
} catch (JSONException e) { | ||
WikibaseRevisionProcessor.logger | ||
.error("Failed to process JSON for property " | ||
+ mwRevision.toString() + " (" + e.toString() + ")"); | ||
} | ||
|
||
} | ||
|
||
@Override | ||
public void finishRevisionProcessing() { | ||
// Nothing to do | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.