Merge branch 'master' into create-parser-of-constraints

Wikidata · Nov 10, 2017 · af4d035 · af4d035
2 parents d7e2085 + c344dfa
commit af4d035
Show file tree

Hide file tree

Showing 20 changed files with 1,616 additions and 26 deletions.
diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
@@ -1,6 +1,24 @@
 Wikidata Toolkit Release Notes
 ==============================
 
+Version 0.8.0
+-------------
+
+New features:
+* Compatibility with JDK 9
+* Allow editing labels, descriptions and aliases using the WikibaseDataEditor (this is a work in progress that is likely to change)
+* Allow using the wbsearchentities API action through WikibaseDataFetcher
+* Quantity bounds are now optional following the change in Wikibase
+* Add the "id" field to entity id JSON serialization following the change in Wikibase
+
+Bug fixes:
+* Do not fail when logging in
+* Do not fail when reading redirections in daily XML dumps
+* Do not fail when new datatypes are introduced in Wikibase
+* Make sure that API warnings are read for all requests
+* Do not fail when reading a bz2 compressed dump when a gzip dump was expected
+* WikibaseDataFetcher is now able to retrieve more than 50 entities at once
+
 Version 0.7.0
 -------------
 

diff --git a/...atamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/WikimediaLanguageCodes.java b/...atamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/WikimediaLanguageCodes.java
@@ -45,6 +45,8 @@ public class WikimediaLanguageCodes {
 		LANGUAGE_CODES.put("aa", "aa"); // Afar
 		LANGUAGE_CODES.put("ab", "ab"); // Abkhazian
 		LANGUAGE_CODES.put("ace", "ace");
+		LANGUAGE_CODES.put("aeb-arab", "aeb-Arab");
+		LANGUAGE_CODES.put("ady", "ady");
 		LANGUAGE_CODES.put("af", "af");
 		LANGUAGE_CODES.put("ak", "ak");
 		LANGUAGE_CODES.put("aln", "aln");
@@ -63,6 +65,7 @@ public class WikimediaLanguageCodes {
 		LANGUAGE_CODES.put("avk", "avk");
 		LANGUAGE_CODES.put("ay", "ay");
 		LANGUAGE_CODES.put("az", "az");
+		LANGUAGE_CODES.put("azb", "azb");
 		LANGUAGE_CODES.put("ba", "ba");
 		LANGUAGE_CODES.put("bar", "bar");
 		LANGUAGE_CODES.put("bat-smg", "sgs"); // TODO might be redundant
@@ -121,6 +124,7 @@ public class WikimediaLanguageCodes {
 														// for formal German
 		LANGUAGE_CODES.put("diq", "diq");
 		LANGUAGE_CODES.put("dsb", "dsb");
+		LANGUAGE_CODES.put("dty", "dty");
 		LANGUAGE_CODES.put("dv", "dv");
 		LANGUAGE_CODES.put("dz", "dz");
 		LANGUAGE_CODES.put("ee", "ee");
@@ -165,6 +169,7 @@ public class WikimediaLanguageCodes {
 		LANGUAGE_CODES.put("glk", "glk");
 		LANGUAGE_CODES.put("gn", "gn");
 		LANGUAGE_CODES.put("got", "got");
+		LANGUAGE_CODES.put("gom", "gom");
 		LANGUAGE_CODES.put("grc", "grc");
 		LANGUAGE_CODES.put("gsw", "gsw");
 		LANGUAGE_CODES.put("gu", "gu");
@@ -208,6 +213,7 @@ public class WikimediaLanguageCodes {
 		LANGUAGE_CODES.put("kab", "kab");
 		LANGUAGE_CODES.put("ka", "ka");
 		LANGUAGE_CODES.put("kbd", "kbd");
+		LANGUAGE_CODES.put("kbp", "kbp");
 		LANGUAGE_CODES.put("kg", "kg");
 		LANGUAGE_CODES.put("ki", "ki");
 		LANGUAGE_CODES.put("kj", "kj");
@@ -264,6 +270,7 @@ public class WikimediaLanguageCodes {
 		LANGUAGE_CODES.put("lt", "lt");
 		LANGUAGE_CODES.put("lv", "lv");
 		LANGUAGE_CODES.put("lzh", "lzh"); // Literary Chinese
+		LANGUAGE_CODES.put("lzz", "lzz");
 		LANGUAGE_CODES.put("mai", "mai");
 		LANGUAGE_CODES.put("map-bms", "jv-x-bms"); // Basa Banyumasan has no
 													// code; jv is a superset
@@ -316,6 +323,7 @@ public class WikimediaLanguageCodes {
 		LANGUAGE_CODES.put("nv", "nv");
 		LANGUAGE_CODES.put("ny", "ny");
 		LANGUAGE_CODES.put("oc", "oc");
+		LANGUAGE_CODES.put("olo", "olo");
 		LANGUAGE_CODES.put("om", "om");
 		LANGUAGE_CODES.put("or", "or");
 		LANGUAGE_CODES.put("os", "os");
@@ -352,6 +360,7 @@ public class WikimediaLanguageCodes {
 		LANGUAGE_CODES.put("rup", "rup"); // Macedo-Romanian/Aromanian
 		LANGUAGE_CODES.put("ru", "ru");
 		LANGUAGE_CODES.put("rw", "rw");
+		LANGUAGE_CODES.put("rwr", "rwr");
 		LANGUAGE_CODES.put("sah", "sah");
 		LANGUAGE_CODES.put("sa", "sa");
 		LANGUAGE_CODES.put("scn", "scn");
@@ -392,6 +401,7 @@ public class WikimediaLanguageCodes {
 		LANGUAGE_CODES.put("sw", "sw");
 		LANGUAGE_CODES.put("szl", "szl");
 		LANGUAGE_CODES.put("ta", "ta");
+		LANGUAGE_CODES.put("tcy", "tcy");
 		LANGUAGE_CODES.put("te", "te");
 		LANGUAGE_CODES.put("tet", "tet");
 		LANGUAGE_CODES.put("tg", "tg");

diff --git a/...datamodel/src/main/java/org/wikidata/wdtk/datamodel/json/jackson/AliasesDeserializer.java b/...datamodel/src/main/java/org/wikidata/wdtk/datamodel/json/jackson/AliasesDeserializer.java
@@ -75,8 +75,6 @@ public Map<String, List<JacksonMonolingualTextValue>> deserialize(
 				}
 			}
 
-		} catch (JsonProcessingException e) {
-			e.printStackTrace();
 		} catch (Exception e) {
 			e.printStackTrace();
 		}

diff --git a/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/FetchOnlineDataExample.java b/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/FetchOnlineDataExample.java
@@ -27,6 +27,7 @@
 import org.wikidata.wdtk.datamodel.interfaces.EntityDocument;
 import org.wikidata.wdtk.datamodel.interfaces.ItemDocument;
 import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue;
+import org.wikidata.wdtk.wikibaseapi.WbSearchEntitiesResult;
 import org.wikidata.wdtk.wikibaseapi.WikibaseDataFetcher;
 import org.wikidata.wdtk.wikibaseapi.apierrors.MediaWikiApiErrorException;
 
@@ -94,6 +95,11 @@ public static void main(String[] args) throws MediaWikiApiErrorException {
 							+ entry.getValue().getEntityId().getId());
 		}
 
+		System.out.println("** Doing search on Wikidata:");
+		for(WbSearchEntitiesResult result : wbdf.searchEntities("Douglas Adams", "fr")) {
+			System.out.println("Found " + result.getEntityId() + " with label " + result.getLabel());
+		}
+
 		System.out.println("*** Done.");
 	}
 

diff --git a/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/TutorialDocumentProcessor.java b/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/TutorialDocumentProcessor.java
@@ -0,0 +1,168 @@
+package org.wikidata.wdtk.examples;
+
+/*
+ * #%L
+ * Wikidata Toolkit Examples
+ * %%
+ * Copyright (C) 2014 Wikidata Toolkit Developers
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.wikidata.wdtk.datamodel.interfaces.EntityDocumentProcessor;
+import org.wikidata.wdtk.datamodel.interfaces.ItemDocument;
+import org.wikidata.wdtk.datamodel.interfaces.PropertyDocument;
+import org.wikidata.wdtk.datamodel.interfaces.TimeValue;
+
+/**
+ * This is a simple template for an {@link EntityDocumentProcessor} that can be
+ * modified to try your own code.
+ * <p>
+ * Exercise 1: Just run the code as it is and have a look at the output. It will
+ * print a lot of data about item documents to the console. You can see roughly
+ * what the data looks like. Find the data for one item and look up the item on
+ * wikidata.org. Find the data that you can see on the Web page in the print out
+ * (note that some details might have changed since your local data is based on a
+ * dump).
+ * <p>
+ * Exercise 2: The code below already counts how many items and properties it
+ * processes. Add additional counters to count: (1) the number of labels, (2)
+ * the number of aliases, (3) the number of statements, (4) the number of site
+ * links. Print this data at the end or write it to the file if you like.
+ * <p>
+ * Exercise 3: Extend your code from Exercise 2 to count how many items have a
+ * link to English Wikipedia (or another Wikipedia of your choice). The site
+ * identifier used in the data for English Wikipedia is "enwiki".
+ * <p>
+ * Exercise 4: Building on the code of Exercise 3, count the number of site
+ * links for all sites that are linked. Use, for example, a hashmap to store
+ * integer counters for each site id you encounter. Print the results to a CSV
+ * file and load the file into a spreadsheet application (this can also be an
+ * online application such as Google Drive). You can order the data by count and
+ * create a diagram. The number of site links should be close to the number of
+ * articles in the project.
+ * <p>
+ * Exercise 5: Compute the average life expectancy of people on Wikidata. To do
+ * this, consider items with a birth date (P569) and death date (P570). Whenever
+ * both dates are found, compute the difference of years between the dates.
+ * Store the sum of these lifespans (in years) and the number of people for
+ * which you recorded a lifespan to compute the average. Some hints:
+ * <ul>
+ * <li>There can be more than one statement for any property, even for date of
+ * birth/death (if there are different opinions). For simplicity, just use the
+ * first.</li>
+ * <li>Dates can be uncertain. This is expressed by their precision,
+ * {@link TimeValue#getPrecision()}. You should only consider values with
+ * precision greater or equal to {@link TimeValue#PREC_DAY}.</li>
+ * </ul>
+ * <p>
+ * Exercise 6: Compute the average life span as in Exercise 5, but now grouped
+ * by year of birth. This will show you how life expectancy changed over time
+ * (at least for people with Wikipedia articles). For this, create arrays or
+ * maps to store the sum of the lifespan and number of people for each year of
+ * birth. Finally, compute all the averages and store them to a CSV file that
+ * gives the average life expectancy for each year of birth. Load this file into
+ * a spreadsheet too to create a diagram. What do you notice? Some hints:
+ * <ul>
+ * <li>An easy way to store the numbers you need for each year of birth is to
+ * use an array where the year is the index. This is possible here since you
+ * know that years should be in a certain range. You could also use a Hashmap,
+ * of course, but sorting by key is more work in this case.</li>
+ * <li>The data can contain errors. If you see strange effects in the results,
+ * maybe you need to filter some unlikely cases.</li>
+ * <li>To get a smooth trend for life expectancy, you need to have at least a
+ * few people for every year of birth. It might be a good idea to consider only
+ * people born after the year 1200 to make sure that you have enough precise
+ * data.</li>
+ * </ul>
+ *
+ * @author Markus Kroetzsch
+ *
+ */
+public class TutorialDocumentProcessor implements EntityDocumentProcessor {
+	private long countItems = 0; // item documents processed so far
+	private long countProperties = 0; // property documents processed so far
+
+	/**
+	 * Processes one item document. This is often the main workhorse that
+	 * gathers the data you are interested in. You can modify this code as you
+	 * wish.
+	 * @param itemDocument the item document to count and, initially, print
+	 */
+	@Override
+	public void processItemDocument(ItemDocument itemDocument) {
+		this.countItems++;
+
+		// Do some printing for demonstration/debugging.
+		// Only print the first 9 items (or it would get too slow).
+		if (this.countItems < 10) {
+			System.out.println(itemDocument);
+		} else if (this.countItems == 10) {
+			System.out.println("*** I won't print any further items.\n"
+					+ "*** We will never finish if we print all the items.\n"
+					+ "*** Maybe remove this debug output altogether.");
+		}
+	}
+
+	/**
+	 * Processes one property document. Property documents mainly tell you the
+	 * name and datatype of properties. It can be useful to process all
+	 * properties first to store data about them that is useful when processing
+	 * items. There are not very many properties (about 1100 as of August 2014),
+	 * so it is safe to store all their data for later use.
+	 * @param propertyDocument the property document to count and, initially, print
+	 */
+	@Override
+	public void processPropertyDocument(PropertyDocument propertyDocument) {
+		this.countProperties++;
+
+		// For testing; disable when no longer needed.
+		// The notice is printed exactly once, when the tenth property arrives.
+		if (this.countProperties < 10) {
+			System.out.println(propertyDocument);
+		} else if (this.countProperties == 10) {
+			System.out.println("*** I won't print any further properties.\n"
+					+ "*** Otherwise you would see only properties and no items.\n"
+					+ "*** Maybe remove this debug output altogether.");
+		}
+	}
+
+	/**
+	 * Stores the processing results in a file. CSV (comma separated values) is
+	 * a simple format that makes sense for such tasks. It can be imported
+	 * easily into spreadsheet tools to generate diagrams from the data.
+	 */
+	public void storeResults() {
+		System.out.println("Processed " + countItems + " items and "
+				+ countProperties + " properties in total.");
+		System.out.println("Storing data ...");
+
+		try (PrintStream out = new PrintStream(new FileOutputStream(
+				"tutorial-results.csv"))) {
+			// Two simple entries for demonstration purposes.
+			// Use your own code when you have more interesting data.
+			out.println("count of items," + countItems);
+			out.println("count of properties," + countProperties);
+		} catch (IOException e) {
+			System.out.println("Oops, I could not write the file: " + e.toString());
+			return; // the file could not be opened, so do not report success
+		}
+
+		System.out.println("... data stored.");
+	}
+}
diff --git a/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/TutorialExample.java b/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/TutorialExample.java
@@ -0,0 +1,72 @@
+package org.wikidata.wdtk.examples;
+
+/*
+ * #%L
+ * Wikidata Toolkit Examples
+ * %%
+ * Copyright (C) 2014 Wikidata Toolkit Developers
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+import org.wikidata.wdtk.datamodel.interfaces.EntityDocumentProcessor;
+import org.wikidata.wdtk.dumpfiles.DumpProcessingController;
+import org.wikidata.wdtk.dumpfiles.MwRevision;
+import org.wikidata.wdtk.dumpfiles.StatisticsMwRevisionProcessor;
+
+/**
+ * This example application applies an {@link EntityDocumentProcessor} to all
+ * documents in a Wikidata dump file. By default, the EntityDocumentProcessor is
+ * {@link TutorialDocumentProcessor}.
+ * <p>
+ * This application is based on the regular data exports provided by Wikidata. By
+ * default, it will run in offline mode. This will only work if you already have
+ * some dump downloaded before. The easiest way of doing this is to disable
+ * offline mode in the source code; the program will then do the downloading for
+ * you.
+ *
+ * @author Markus Kroetzsch
+ *
+ */
+public class TutorialExample {
+
+	/** Applies a {@link TutorialDocumentProcessor} to the most recent main dump. */
+	public static void main(String[] args) {
+		ExampleHelpers.configureLogging();
+
+		// Controller object for dump processing, switched to offline mode:
+		// this only works if a dump has already been downloaded.
+		DumpProcessingController controller = new DumpProcessingController(
+				"wikidatawiki");
+		controller.setOfflineMode(true);
+
+		// A single processor instance handles both item and property documents.
+		TutorialDocumentProcessor tutorialProcessor = new TutorialDocumentProcessor();
+		controller.registerEntityDocumentProcessor(tutorialProcessor,
+				MwRevision.MODEL_WIKIBASE_ITEM, true);
+		controller.registerEntityDocumentProcessor(tutorialProcessor,
+				MwRevision.MODEL_WIKIBASE_PROPERTY, true);
+
+		// Additional revision processor for statistics and time keeping.
+		StatisticsMwRevisionProcessor statistics = new StatisticsMwRevisionProcessor(
+				"statistics", 10000);
+		controller.registerMwRevisionProcessor(statistics, null, true);
+
+		// Run the processing.
+		controller.processMostRecentMainDump();
+
+		// Let the tutorial processor write out what it counted.
+		tutorialProcessor.storeResults();
+	}
+}