diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 917031078..0a571e6fe 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -5,11 +5,13 @@ Version 0.2.0 ------------- New features: +* Support for resolving site links, based on information from the sites table dump + (as demonstrated in a new example program) * Support for SnakGroups (data model updated to group Snaks by property in all lists) Bug fixes: * Support SomeValueSnak and NoValueSnak in references (Issue #44) - +* Use correct site links when importing data from dumps (Issue #37) Version 0.1.0 diff --git a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/DataObjectFactoryImpl.java b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/DataObjectFactoryImpl.java index 23e8c1a32..9462f0c2d 100644 --- a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/DataObjectFactoryImpl.java +++ b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/DataObjectFactoryImpl.java @@ -150,9 +150,9 @@ public StatementGroup getStatementGroup(List statements) { } @Override - public SiteLink getSiteLink(String title, String siteKey, String baseIri, + public SiteLink getSiteLink(String title, String siteKey, List badges) { - return new SiteLinkImpl(title, siteKey, baseIri, badges); + return new SiteLinkImpl(title, siteKey, badges); } @Override diff --git a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/SiteLinkImpl.java b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/SiteLinkImpl.java index 449593585..c2e3e99e5 100644 --- a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/SiteLinkImpl.java +++ b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/SiteLinkImpl.java @@ -20,8 +20,6 @@ * #L% */ -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; import java.util.Collections; import java.util.List; @@ -42,32 +40,31 @@ public class SiteLinkImpl implements SiteLink { final String title; final String siteKey; - final String baseIri; final List badges; /** * Constructor. * * @param title + * the title string of the linked page, including namespace + * prefixes if any * @param siteKey - * @param baseIri + * the string key of the site of the linked article * @param badges + * the list of badges of the linked article */ - SiteLinkImpl(String title, String siteKey, String baseIri, - List badges) { + SiteLinkImpl(String title, String siteKey, List badges) { Validate.notNull(title, "title cannot be null"); Validate.notNull(siteKey, "siteKey cannot be null"); - Validate.notNull(baseIri, "base IRI cannot be null"); Validate.notNull(badges, "list of badges cannot be null"); this.title = title; this.siteKey = siteKey; - this.baseIri = baseIri; this.badges = badges; } @Override - public String getArticleTitle() { + public String getPageTitle() { return title; } @@ -76,16 +73,6 @@ public String getSiteKey() { return siteKey; } - @Override - public String getUrl() { - try { - return baseIri.concat(URLEncoder.encode(title, "utf-8")); - } catch (UnsupportedEncodingException e) { - throw new RuntimeException( - "Your JRE does not support UTF-8 encoding. 
Srsly?!", e); - } - } - @Override public List getBadges() { return Collections.unmodifiableList(badges); @@ -100,7 +87,6 @@ public List getBadges() { public int hashCode() { final int prime = 31; int result = 1; - result = prime * result + baseIri.hashCode(); result = prime * result + badges.hashCode(); result = prime * result + siteKey.hashCode(); result = prime * result + title.hashCode(); @@ -124,14 +110,13 @@ public boolean equals(Object obj) { return false; } SiteLinkImpl other = (SiteLinkImpl) obj; - return baseIri.equals(other.baseIri) && badges.equals(other.badges) - && siteKey.equals(other.siteKey) && title.equals(other.title); + return badges.equals(other.badges) && siteKey.equals(other.siteKey) + && title.equals(other.title); } - + @Override - public String toString(){ - return "SiteLink {title = " + this.baseIri + "/" + this.title - + ", siteKey = " + siteKey + public String toString() { + return "SiteLink {title = " + this.title + ", siteKey = " + siteKey + ", badges = " + this.badges + "}"; } } diff --git a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/SitesImpl.java b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/SitesImpl.java new file mode 100644 index 000000000..f00831ff1 --- /dev/null +++ b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/implementation/SitesImpl.java @@ -0,0 +1,203 @@ +package org.wikidata.wdtk.datamodel.implementation; + +/* + * #%L + * Wikidata Toolkit Data Model + * %% + * Copyright (C) 2014 Wikidata Toolkit Developers + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.util.HashMap; + +import org.apache.commons.lang3.Validate; +import org.wikidata.wdtk.datamodel.interfaces.SiteLink; +import org.wikidata.wdtk.datamodel.interfaces.Sites; + +/** + * Implementation of the {@link Sites} interface that allows sites to be + * registered. Objects of this type are not immutable, since they are not data + * objects, but the {@link Sites} interface only supports read access. + * + * @author Markus Kroetzsch + * + */ +public class SitesImpl implements Sites { + + /** + * Simple record for holding information about a site. + * + * @author Markus Kroetzsch + * + */ + class SiteInformation { + final String siteKey; + final String group; + final String languageCode; + final String siteType; + final String filePathPre; + final String filePathPost; + final String pagePathPre; + final String pagePathPost; + + SiteInformation(String siteKey, String group, String languageCode, + String siteType, String filePath, String pagePath) { + // Null might be acceptable for some of the following; but this + // should only be changed when we have a case where this is correct. 
+ Validate.notNull(siteKey, "Site key must not be null."); + Validate.notNull(group, "Group must not be null."); + Validate.notNull(languageCode, "Language code must not be null."); + Validate.notNull(siteType, "Site type must not be null."); + Validate.notNull(filePath, "File path must not be null."); + Validate.notNull(pagePath, "Page path must not be null."); + + this.siteKey = siteKey; + this.group = group; + this.languageCode = languageCode; + this.siteType = siteType; + + int iFileName = filePath.indexOf("$1"); + this.filePathPre = filePath.substring(0, iFileName); + this.filePathPost = filePath.substring(iFileName + 2, + filePath.length()); + + int iPageName = pagePath.indexOf("$1"); + this.pagePathPre = pagePath.substring(0, iPageName); + this.pagePathPost = pagePath.substring(iPageName + 2, + pagePath.length()); + } + + /** + * Returns the file URL. + * + * @see Sites#getFileUrl(String, String) + * @param fileName + * the file name + * @return the file URL + */ + String getFileUrl(String fileName) { + return this.filePathPre + fileName + this.filePathPost; + } + + /** + * Returns the page URL. The method replaces spaces by underscores in + * page titles on MediaWiki sites, since this is how MediaWiki page URLs + * are constructed. For other sites, this might not be the case and + * spaces will just be escaped in the standard way using "+". + * + * @see Sites#getPageUrl(String, String) + * @param pageTitle + * the page title, not escaped + * @return the page URL + */ + String getPageUrl(String pageTitle) { + try { + String encodedTitle; + if ("mediawiki".equals(this.siteType)) { + encodedTitle = URLEncoder.encode( + pageTitle.replace(" ", "_"), "utf-8"); + // Keep special title symbols unescaped: + encodedTitle = encodedTitle.replace("%3A", ":").replace( + "%2F", "/"); + } else { + encodedTitle = URLEncoder.encode(pageTitle, "utf-8"); + } + return this.pagePathPre + encodedTitle + this.pagePathPost; + } catch (UnsupportedEncodingException e) { + throw new RuntimeException( + "Your JRE does not support UTF-8 encoding. Srsly?!", e); + } + } + } + + final HashMap sites = new HashMap(); + + /** + * Sets the stored information for the site of the given key to the given + * values. 
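Usage sketch (illustrative, not part of the patch; the method and class names are taken from this file, and the example data mirrors SitesImplTest further below). The registry is populated once per site and then queried; "$1" in the two paths marks where the file name or page title is substituted:

    SitesImpl sites = new SitesImpl();
    sites.setSiteInformation("enwiki", "wikipedia", "en", "mediawiki",
            "http://en.wikipedia.org/w/$1",     // file path
            "http://en.wikipedia.org/wiki/$1"); // page path

    // On "mediawiki"-type sites, spaces in page titles become
    // underscores, so this should return
    // "http://en.wikipedia.org/wiki/Douglas_Adams":
    String url = sites.getPageUrl("enwiki", "Douglas Adams");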
+ * + * @param siteKey + * the global site key + * @param group + * the site group + * @param languageCode + * the site MediaWiki language code + * @param siteType + * the site type + * @param filePath + * the file path with $1 as a placeholder for the file name + * @param pagePath + * the page path with $1 as a placeholder for the page title + */ + public void setSiteInformation(String siteKey, String group, + String languageCode, String siteType, String filePath, + String pagePath) { + this.sites.put(siteKey, new SiteInformation(siteKey, group, + languageCode, siteType, filePath, pagePath)); + } + + @Override + public String getLanguageCode(String siteKey) { + if (this.sites.containsKey(siteKey)) { + return this.sites.get(siteKey).languageCode; + } else { + return null; + } + } + + @Override + public String getGroup(String siteKey) { + if (this.sites.containsKey(siteKey)) { + return this.sites.get(siteKey).group; + } else { + return null; + } + } + + @Override + public String getPageUrl(String siteKey, String pageTitle) { + if (this.sites.containsKey(siteKey)) { + return this.sites.get(siteKey).getPageUrl(pageTitle); + } else { + return null; + } + } + + @Override + public String getSiteLinkUrl(SiteLink siteLink) { + return this.getPageUrl(siteLink.getSiteKey(), siteLink.getPageTitle()); + } + + @Override + public String getFileUrl(String siteKey, String fileName) { + if (this.sites.containsKey(siteKey)) { + return this.sites.get(siteKey).getFileUrl(fileName); + } else { + return null; + } + } + + @Override + public String getSiteType(String siteKey) { + if (this.sites.containsKey(siteKey)) { + return this.sites.get(siteKey).siteType; + } else { + return null; + } + } + +} diff --git a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/DataObjectFactory.java b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/DataObjectFactory.java index 8aef0dd0b..fbedd8a62 100644 --- a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/DataObjectFactory.java +++ b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/DataObjectFactory.java @@ -248,18 +248,15 @@ Statement getStatement(Claim claim, List references, * Creates a {@link SiteLink}. * * @param title - * the title string of the linked article + * the title string of the linked page, including namespace + * prefixes if any * @param siteKey * the string key of the site of the linked article - * @param baseIri - * the string key of the site of the linked article; this might - * be computed from the site key in the future * @param badges * the list of badges of the linked article * @return a {@link SiteLink} corresponding to the input */ - SiteLink getSiteLink(String title, String siteKey, String baseIri, - List badges); + SiteLink getSiteLink(String title, String siteKey, List badges); /** * Creates a {@link PropertyDocument}. diff --git a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/SiteLink.java b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/SiteLink.java index 77d7b36b1..3331939ee 100644 --- a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/SiteLink.java +++ b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/SiteLink.java @@ -27,6 +27,11 @@ * site, and a list of "badges" that this article holds. Badges are specific * tags used on Wikimedia project sites for some articles, most prominently for * "featured articles". + *

+ * In spite of its name, the site link does not specify a full URL that it links + * to. It only provides a page title and a site key that may be used to find a + * URL. To do this, the site links need to be resolved using a {@link Sites} + * object. * * @author Markus Kroetzsch * @@ -34,11 +39,11 @@ public interface SiteLink { /** - * Get the string title of the linked article. + * Get the string title of the linked page. * * @return */ - String getArticleTitle(); + String getPageTitle(); /** * Get the string key of the linked site. @@ -47,13 +52,6 @@ public interface SiteLink { */ String getSiteKey(); - /** - * Get the full IRI (URL) of the linked article. - * - * @return - */ - String getUrl(); - /** * Get the list of badges of the linked article. * diff --git a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/Sites.java b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/Sites.java new file mode 100644 index 000000000..fc9078c12 --- /dev/null +++ b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/interfaces/Sites.java @@ -0,0 +1,119 @@ +package org.wikidata.wdtk.datamodel.interfaces; + +/* + * #%L + * Wikidata Toolkit Data Model + * %% + * Copyright (C) 2014 Wikidata Toolkit Developers + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +/** + * Registry to manage the association between site keys (such as "enwiki") and + * base URLs (such as "http://en.wikipedia.org/wiki/") that is needed to + * interpret {@link SiteLink} objects. These associations are part of the + * configuration of a MediaWiki site and therefore not fixed. + *

+ * This is not a Wikibase data object as such, but part of the general + * configuration of a Wikibase site. + * + * @author Markus Kroetzsch + * + */ +public interface Sites { + + /** + * Returns the MediaWiki language code for the given site, or null if there + * is no such data for this site key. + *

+	 * The language code follows the MediaWiki conventions for language codes,
+	 * which do not follow any standard. Most codes agree with those in BCP 47,
+	 * but there are a number of exceptions.
+	 *
+	 * @param siteKey
+	 *            the global site key
+	 * @return the corresponding MediaWiki language code, or null if not known
+	 */
+	String getLanguageCode(String siteKey);
+
+	/**
+	 * Returns the group for the given site, or null if there is no such data
+	 * for this site key. The group is a string identifier used for
+	 * configuration purposes. Typical groups on Wikimedia sites include
+	 * "wikipedia", "wikisource", "wikivoyage", and "wikiquote", used for most
+	 * sites of these projects, but also singleton groups like "commons" and
+	 * "wikimania2013".
+	 *
+	 * @param siteKey
+	 *            the global site key
+	 * @return the corresponding group, or null if not known
+	 */
+	String getGroup(String siteKey);
+
+	/**
+	 * Returns the URL for the page of the given name, or null if the site is
+	 * not known. All characters in the page title will be escaped for use in
+	 * URLs.
+	 *
+	 * @param siteKey
+	 *            the global site key
+	 * @param pageTitle
+	 *            the title of the page, including namespace prefixes if any
+	 * @return the URL to link to this page on the given site, or null if not
+	 *         known
+	 */
+	String getPageUrl(String siteKey, String pageTitle);
+
+	/**
+	 * Returns the URL for the given site link, or null if its site key is not
+	 * known.
+	 *
+	 * @param siteLink
+	 *            the SiteLink object
+	 * @return the page URL for this site link, or null if not known
+	 */
+	String getSiteLinkUrl(SiteLink siteLink);
+
+	/**
+	 * Returns the URL for the file of the given name, or null if the site is
+	 * not known. The file name is not escaped for use in URLs, so that one can
+	 * use this method to construct URLs with parameters, e.g., when calling
+	 * the API of the site. Also note that this method does not construct URLs
+	 * for files uploaded to a MediaWiki site using the given file name; such
+	 * files are usually placed in some subdirectory.
+	 *
+	 * @param siteKey
+	 *            the global site key
+	 * @param fileName
+	 *            the name of the file
+	 * @return the URL to link to this file on the given site, or null if not
+	 *         known
+	 */
+	String getFileUrl(String siteKey, String fileName);
+
+	/**
+	 * Returns the type for the given site, or null if there is no such data
+	 * for this site key. For MediaWiki sites, this is "mediawiki".
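A sketch of how the pieces fit together (illustrative; assumes the usual imports and a Sites object populated as above). The SiteLink itself now carries only a page title and a site key, and this interface turns it into a URL:

    SiteLink link = new DataObjectFactoryImpl().getSiteLink("Dresden",
            "enwiki", Collections.<String> emptyList());

    // null if "enwiki" was never registered with this Sites object:
    String url = sites.getSiteLinkUrl(link);
    // e.g. "http://en.wikipedia.org/wiki/Dresden"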
+ * + * @param siteKey + * the global site key + * @return the corresponding type, or null if not known + */ + String getSiteType(String siteKey); + +} diff --git a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/json/JsonConverter.java b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/json/JsonConverter.java index f7cac5ddb..ac668ba09 100644 --- a/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/json/JsonConverter.java +++ b/wdtk-datamodel/src/main/java/org/wikidata/wdtk/datamodel/json/JsonConverter.java @@ -321,7 +321,7 @@ JSONArray convertStatementGroupToJson(StatementGroup statementGroup) { JSONObject getJsonForSiteLink(SiteLink link) { JSONObject result = new JSONObject(); result.put("site", link.getSiteKey()); - result.put(JsonConstants.KEY_TITLE, link.getArticleTitle()); + result.put(JsonConstants.KEY_TITLE, link.getPageTitle()); result.put("badges", new JSONArray()); // always empty at the moment return result; } diff --git a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/DataObjectFactoryImplTest.java b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/DataObjectFactoryImplTest.java index 86f0f6103..9aed11ff1 100644 --- a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/DataObjectFactoryImplTest.java +++ b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/DataObjectFactoryImplTest.java @@ -226,9 +226,9 @@ public final void testGetStatementGroup() { @Test public final void testGetSiteLink() { SiteLink o1 = new SiteLinkImpl("SOLID", "enwiki", - "http://en.wikipedia.org", Collections. emptyList()); + Collections. emptyList()); SiteLink o2 = factory.getSiteLink("SOLID", "enwiki", - "http://en.wikipedia.org", Collections. emptyList()); + Collections. emptyList()); assertEquals(o1, o2); } diff --git a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/ItemDocumentImplTest.java b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/ItemDocumentImplTest.java index 9f6733d53..b5fecd6a4 100644 --- a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/ItemDocumentImplTest.java +++ b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/ItemDocumentImplTest.java @@ -66,8 +66,8 @@ public void setUp() throws Exception { StatementRank.NORMAL, "MyId"); StatementGroup sg = new StatementGroupImpl(Collections.singletonList(s)); statementGroups = Collections.singletonList(sg); + SiteLink sl = new SiteLinkImpl("Douglas Adams", "enwiki", - "http://en.wikipedia.org/wiki/", Collections. emptyList()); sitelinks = Collections.singletonMap("enwiki", sl); diff --git a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/SiteLinkImplTest.java b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/SiteLinkImplTest.java index 0dc0787a2..4759d30fb 100644 --- a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/SiteLinkImplTest.java +++ b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/SiteLinkImplTest.java @@ -40,41 +40,32 @@ public class SiteLinkImplTest { @Before public void setUp() throws Exception { s1 = new SiteLinkImpl("Dresden", "enwiki", - "http://en.wikipedia.org/wiki/", Collections. emptyList()); s2 = new SiteLinkImpl("Dresden", "enwiki", - "http://en.wikipedia.org/wiki/", Collections. 
emptyList()); } @Test public void fieldsIsCorrect() { - assertEquals(s1.getArticleTitle(), "Dresden"); + assertEquals(s1.getPageTitle(), "Dresden"); assertEquals(s1.getSiteKey(), "enwiki"); assertEquals(s1.getBadges(), Collections. emptyList()); } @Test public void equalityBasedOnContent() { - SiteLink s3 = new SiteLinkImpl("Berlin", "enwiki", - "http://en.wikipedia.org/wiki/", + SiteLink sDiffTitle = new SiteLinkImpl("Berlin", "enwiki", Collections. emptyList()); - SiteLink s4 = new SiteLinkImpl("Dresden", "dewiki", - "http://en.wikipedia.org/wiki/", + SiteLink sDiffSiteKey = new SiteLinkImpl("Dresden", "dewiki", Collections. emptyList()); - SiteLink s5 = new SiteLinkImpl("Dresden", "enwiki", - "http://de.wikipedia.org/wiki/", - Collections. emptyList()); - SiteLink s6 = new SiteLinkImpl("Dresden", "enwiki", - "http://en.wikipedia.org/wiki/", + SiteLink sDiffBadges = new SiteLinkImpl("Dresden", "enwiki", Collections. singletonList("some badge?")); assertEquals(s1, s1); assertEquals(s1, s2); - assertThat(s1, not(equalTo(s3))); - assertThat(s1, not(equalTo(s4))); - assertThat(s1, not(equalTo(s5))); - assertThat(s1, not(equalTo(s6))); + assertThat(s1, not(equalTo(sDiffTitle))); + assertThat(s1, not(equalTo(sDiffSiteKey))); + assertThat(s1, not(equalTo(sDiffBadges))); assertThat(s1, not(equalTo(null))); assertFalse(s1.equals(this)); } @@ -84,43 +75,19 @@ public void hashBasedOnContent() { assertEquals(s1.hashCode(), s2.hashCode()); } - @Test - public void siteLinkIri() { - assertEquals(s1.getUrl(), "http://en.wikipedia.org/wiki/Dresden"); - - SiteLink sSpecialChar = new SiteLinkImpl("&", "dewiki", - "http://de.wikipedia.org/wiki/", - Collections. emptyList()); - assertEquals(sSpecialChar.getUrl(), "http://de.wikipedia.org/wiki/%26"); - SiteLink sSpecialChar2 = new SiteLinkImpl("Björk", "dewiki", - "http://de.wikipedia.org/wiki/", - Collections. emptyList()); - assertEquals(sSpecialChar2.getUrl(), - "http://de.wikipedia.org/wiki/Bj%C3%B6rk"); - } - @Test(expected = NullPointerException.class) public void titleNotNull() { - new SiteLinkImpl(null, "enwiki", "http://en.wikipedia.org/wiki/", - Collections. emptyList()); + new SiteLinkImpl(null, "enwiki", Collections. emptyList()); } @Test(expected = NullPointerException.class) public void siteKeyNotNull() { - new SiteLinkImpl("Dresden", null, "http://en.wikipedia.org/wiki/", - Collections. emptyList()); - } - - @Test(expected = NullPointerException.class) - public void baseIriNotNull() { - new SiteLinkImpl("Dresden", "enwiki", null, - Collections. emptyList()); + new SiteLinkImpl("Dresden", null, Collections. emptyList()); } @Test(expected = NullPointerException.class) public void badgesNotNull() { - new SiteLinkImpl("Dresden", "enwiki", "http://en.wikipedia.org/wiki/", - null); + new SiteLinkImpl("Dresden", "enwiki", null); } } diff --git a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/SitesImplTest.java b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/SitesImplTest.java new file mode 100644 index 000000000..48f0bb240 --- /dev/null +++ b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/implementation/SitesImplTest.java @@ -0,0 +1,85 @@ +package org.wikidata.wdtk.datamodel.implementation; + +/* + * #%L + * Wikidata Toolkit Data Model + * %% + * Copyright (C) 2014 Wikidata Toolkit Developers + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import static org.junit.Assert.assertEquals; + +import java.util.Collections; + +import org.junit.Before; +import org.junit.Test; +import org.wikidata.wdtk.datamodel.interfaces.SiteLink; + +public class SitesImplTest { + + SitesImpl sites; + + @Before + public void setUp() throws Exception { + this.sites = new SitesImpl(); + this.sites.setSiteInformation("enwiki", "wikipedia", "en", "mediawiki", + "http://en.wikipedia.org/w/$1", + "http://en.wikipedia.org/wiki/$1"); + this.sites.setSiteInformation("dewiki", "wikipedia", "de", "mediawiki", + "http://de.wikipedia.org/w/$1", + "http://de.wikipedia.org/wiki/$1"); + this.sites.setSiteInformation("somesite", "group", "language", + "something else", "http://example.org/file/$1", + "http://example.org/page/$1"); + } + + @Test + public void siteLinkIri() { + SiteLink sSpecialChar = new SiteLinkImpl("&", "dewiki", + Collections. emptyList()); + assertEquals(this.sites.getSiteLinkUrl(sSpecialChar), + "http://de.wikipedia.org/wiki/%26"); + + SiteLink sSpecialChar2 = new SiteLinkImpl("Björk", "enwiki", + Collections. emptyList()); + assertEquals(this.sites.getSiteLinkUrl(sSpecialChar2), + "http://en.wikipedia.org/wiki/Bj%C3%B6rk"); + } + + @Test + public void unknownSiteKey() { + assertEquals(this.sites.getGroup("somekey"), null); + assertEquals(this.sites.getSiteType("somekey"), null); + assertEquals(this.sites.getLanguageCode("somekey"), null); + assertEquals(this.sites.getFileUrl("somekey", "filename"), null); + assertEquals(this.sites.getPageUrl("somekey", "page name"), null); + } + + @Test + public void knownSiteKey() { + assertEquals(this.sites.getGroup("enwiki"), "wikipedia"); + assertEquals(this.sites.getSiteType("enwiki"), "mediawiki"); + assertEquals(this.sites.getLanguageCode("enwiki"), "en"); + assertEquals(this.sites.getFileUrl("enwiki", "filename"), + "http://en.wikipedia.org/w/filename"); + assertEquals(this.sites.getPageUrl("enwiki", "Page name"), + "http://en.wikipedia.org/wiki/Page_name"); + + assertEquals(this.sites.getPageUrl("somesite", "Page name"), + "http://example.org/page/Page+name"); + } + +} diff --git a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/json/JsonConverterTest.java b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/json/JsonConverterTest.java index ea8509e9b..3f26e3315 100644 --- a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/json/JsonConverterTest.java +++ b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/json/JsonConverterTest.java @@ -41,7 +41,6 @@ import org.wikidata.wdtk.datamodel.interfaces.StatementGroup; import org.wikidata.wdtk.datamodel.interfaces.StatementRank; import org.wikidata.wdtk.datamodel.interfaces.ValueSnak; -import org.wikidata.wdtk.datamodel.json.JsonConverter; import org.wikidata.wdtk.testing.MockStringContentFactory; public class JsonConverterTest { @@ -53,7 +52,7 @@ public class JsonConverterTest { final static String FILE_NAME_REFERENCE = "Reference.json"; final static String JSON_EMPTY_PROPERTY_DOCUMENT = "{\"id\":\"P42\",\"title\":\"P42\",\"type\":\"property\"}"; - final static String JSON_SITE_LINK = 
"{\"site\":\"siteKey\",\"badges\":[],\"title\":\"title\"}"; + final static String JSON_SITE_LINK = "{\"site\":\"enwiki\",\"badges\":[],\"title\":\"title\"}"; final DataObjectFactory dataObjectFactory = new DataObjectFactoryImpl(); final TestObjectFactory testObjectFactory = new TestObjectFactory(); @@ -161,7 +160,6 @@ public void testClaim() throws JSONException, IOException { @Test public void testReference() throws JSONException, IOException { Reference ref = testObjectFactory.createReference(); - System.out.println(jsonConverter.getJsonForReference(ref)); JsonResultComparer.compareJSONObjects( jsonConverter.getJsonForReference(ref), getResourceFromFile(FILE_NAME_REFERENCE)); @@ -187,11 +185,10 @@ public void testStatementGroup() throws JSONException, IOException { @Test public void testSiteLinks() { - SiteLink siteLink = dataObjectFactory.getSiteLink("title", "siteKey", - "baseIri", Collections. emptyList()); + SiteLink siteLink = dataObjectFactory.getSiteLink("title", "enwiki", + Collections. emptyList()); JsonResultComparer.compareJSONObjects(jsonConverter .getJsonForSiteLink(siteLink), new JSONObject(JSON_SITE_LINK)); - } } diff --git a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/json/TestObjectFactory.java b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/json/TestObjectFactory.java index 922335c98..9608d6c06 100644 --- a/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/json/TestObjectFactory.java +++ b/wdtk-datamodel/src/test/java/org/wikidata/wdtk/datamodel/json/TestObjectFactory.java @@ -221,29 +221,24 @@ public List createDescriptions() { /** * Creates a map of {@link SiteLink}s with empty badges. - * *

	 * <p>
	 * ID = SLs
	 * <p>
	 * Default values
	 * <ul>
-	 * <li>SiteLink: name = "enwiki" title = "title_en", siteKey = "siteKey",
-	 * baseIri = "test"</li>
-	 * <li>SiteLink: name = "auwiki" title = "title_au", siteKey = "siteKey",
-	 * baseIri = "test"</li>
+	 * <li>"enwiki" => SiteLink: title = "title_en", siteKey = "enwiki"</li>
+	 * <li>"dewiki" => SiteLink: title = "title_de", siteKey = "dewiki"</li>
	 * </ul>
* - * @return Map for {@link SiteLink}s and there titles + * @return Map for {@link SiteLink}s and their titles */ public Map createSiteLinks() { Map result = new HashMap(); - result.put("enwiki", factory.getSiteLink("title_en", "siteKey", - baseIri, new LinkedList())); - result.put("auwiki", factory.getSiteLink("title_au", "siteKey", - baseIri, new LinkedList())); + result.put("enwiki", factory.getSiteLink("title_en", "enwiki", + new LinkedList())); + result.put("dewiki", factory.getSiteLink("title_de", "dewiki", + new LinkedList())); return result; } diff --git a/wdtk-datamodel/src/test/resources/ItemDocument.json b/wdtk-datamodel/src/test/resources/ItemDocument.json index 36fc324c2..b07808298 100644 --- a/wdtk-datamodel/src/test/resources/ItemDocument.json +++ b/wdtk-datamodel/src/test/resources/ItemDocument.json @@ -1 +1 @@ -{"id":"Q10","claims":{"P11":[{"id":"none","rank":"normal","mainsnak":{"property":"P11","snaktype":"novalue"},"type":"statement"}],"P1040":[{"id":"none2","references":[{"snak-order":["P112"],"snaks":{"P112":[{"property":"P112","snaktype":"value","datavalue":{"value":{"calendarmodel":"http://www.wikidata.org/entity/Q1985727","after":43,"timezone":0,"time":"+00000000306-11-03T13:07:06Z","precision":32,"before":17},"type":"time"}}]}}],"rank":"normal","mainsnak":{"property":"P1040","snaktype":"value","datavalue":{"value":{"calendarmodel":"http://www.wikidata.org/entity/Q1985727","after":43,"timezone":0,"time":"+00000000306-11-03T13:07:06Z","precision":32,"before":17},"type":"time"}},"qualifiers":{"P15":[{"property":"P15","snaktype":"value","datavalue":{"value":{"calendarmodel":"http://www.wikidata.org/entity/Q1985727","after":43,"timezone":0,"time":"+00000000306-11-03T13:07:06Z","precision":32,"before":17},"type":"time"}}]},"type":"statement"},{"id":"none","rank":"normal","mainsnak":{"property":"P1040","snaktype":"value","datavalue":{"value":"TestString","type":"string"}},"type":"statement"}]},"title":"Q10","labels":{"lc":{"value":"foo","language":"lc"},"lc2":{"value":"bar","language":"lc2"}},"type":"item","aliases":{"lc":[{"value":"foo","language":"lc"},{"value":"bar","language":"lc"}]},"descriptions":{"lc":{"value":"it's foo","language":"lc"},"lc2":{"value":"it's bar","language":"lc2"}},"sitelinks":{"auwiki":{"site":"siteKey","badges":[],"title":"title_au"},"enwiki":{"site":"siteKey","badges":[],"title":"title_en"}}} \ No newline at end of file 
+{"id":"Q10","claims":{"P11":[{"id":"none","rank":"normal","mainsnak":{"property":"P11","snaktype":"novalue"},"type":"statement"}],"P1040":[{"id":"none2","references":[{"snak-order":["P112"],"snaks":{"P112":[{"property":"P112","snaktype":"value","datavalue":{"value":{"calendarmodel":"http://www.wikidata.org/entity/Q1985727","after":43,"timezone":0,"time":"+00000000306-11-03T13:07:06Z","precision":32,"before":17},"type":"time"}}]}}],"rank":"normal","mainsnak":{"property":"P1040","snaktype":"value","datavalue":{"value":{"calendarmodel":"http://www.wikidata.org/entity/Q1985727","after":43,"timezone":0,"time":"+00000000306-11-03T13:07:06Z","precision":32,"before":17},"type":"time"}},"qualifiers":{"P15":[{"property":"P15","snaktype":"value","datavalue":{"value":{"calendarmodel":"http://www.wikidata.org/entity/Q1985727","after":43,"timezone":0,"time":"+00000000306-11-03T13:07:06Z","precision":32,"before":17},"type":"time"}}]},"type":"statement"},{"id":"none","rank":"normal","mainsnak":{"property":"P1040","snaktype":"value","datavalue":{"value":"TestString","type":"string"}},"type":"statement"}]},"title":"Q10","labels":{"lc":{"value":"foo","language":"lc"},"lc2":{"value":"bar","language":"lc2"}},"type":"item","aliases":{"lc":[{"value":"foo","language":"lc"},{"value":"bar","language":"lc"}]},"descriptions":{"lc":{"value":"it's foo","language":"lc"},"lc2":{"value":"it's bar","language":"lc2"}},"sitelinks":{"dewiki":{"site":"dewiki","badges":[],"title":"title_de"},"enwiki":{"site":"enwiki","badges":[],"title":"title_en"}}} \ No newline at end of file diff --git a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/DumpContentType.java b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/DumpContentType.java index a0cc59193..43f8c9f8c 100644 --- a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/DumpContentType.java +++ b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/DumpContentType.java @@ -28,5 +28,5 @@ * */ public enum DumpContentType { - DAILY, CURRENT, FULL + DAILY, CURRENT, FULL, SITES } \ No newline at end of file diff --git a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/JsonConverter.java b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/JsonConverter.java index 890eb7fc5..6487ac077 100644 --- a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/JsonConverter.java +++ b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/JsonConverter.java @@ -82,7 +82,7 @@ public class JsonConverter { private static final String KEY_LINK = "links"; private final DataObjectFactory factory; - private String baseIri = ""; + private final String baseIri; private final MonolingualTextValueHandler mltvHandler; private final StatementGroupBuilder statementGroupBuilder; @@ -99,7 +99,10 @@ public class JsonConverter { * data model */ public JsonConverter(String baseIri, DataObjectFactory factory) { - this.setBaseIri(baseIri); + Validate.notNull(baseIri); + Validate.notNull(factory); + + this.baseIri = baseIri; this.factory = factory; mltvHandler = new MonolingualTextValueHandler(this.factory); @@ -348,9 +351,6 @@ private Map getSiteLinks(JSONObject jsonObject) Map result = new HashMap(); - // FIXME we need to get the proper IRI instead - String siteIri = ""; - // json.org does not type its Iterator: unchecked cast needed @SuppressWarnings("unchecked") Iterator linkIterator = jsonObject.keys(); @@ -376,8 +376,7 @@ private Map getSiteLinks(JSONObject jsonObject) } // create the SiteLink instance - SiteLink siteLink = factory.getSiteLink(title, siteKey, 
siteIri, - badges); + SiteLink siteLink = factory.getSiteLink(title, siteKey, badges); result.put(siteKey, siteLink); } @@ -404,30 +403,28 @@ private List getStatementGroups(JSONArray jsonStatements, List statementsFromJson = new ArrayList( jsonStatements.length()); - // iterate over all the statements in the item and decompose them + // Get the (flat) list of all Statements from JSON: for (int i = 0; i < jsonStatements.length(); i++) { - - // snak conversion might fail - // so gracefully skip them and log a debug message JSONObject statementJson = jsonStatements.getJSONObject(i); - try { // only conversion exceptions are to be caught + + // Skip only current Statement in case of errors, but log error + try { Statement statement = this.getStatement(statementJson, entityIdValue); statementsFromJson.add(statement); } catch (IllegalArgumentException e) { - logger.debug("Encountered an illegal argument exception during statement parsing:\n" + logger.error("IllegalArgumentException during statement parsing:\n" + e.getMessage() - + "\nIn statement\n" + + "\nIn statement: \n" + statementJson.toString(2)); } catch (JSONException e) { - logger.debug("Encountered a JSON exception during statement parsing:\n" - + e.getMessage() - + "\nIn statement\n" + logger.error("JSONException during statement parsing:\n" + + e.getMessage() + "\nIn statement: \n" + statementJson.toString(2)); } } - // process the list of statements into a list of statement groups + // Process the list of statements into a list of statement groups: result = this.statementGroupBuilder .buildFromStatementList(statementsFromJson); @@ -936,17 +933,4 @@ public String getBaseIri() { return baseIri; } - /** - * For the baseIri see also - * {@link org.wikidata.wdtk.datamodel.interfaces.ItemId} - * - * @param baseIri - * the new baseIRI to be set. If the given string is null, - * nothing will be done. - */ - public void setBaseIri(String baseIri) { - Validate.notNull(baseIri); - this.baseIri = baseIri; - } - } diff --git a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwDumpFile.java b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwDumpFile.java index f4e6c2f04..a1907016d 100644 --- a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwDumpFile.java +++ b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwDumpFile.java @@ -96,7 +96,7 @@ public int compare(MwDumpFile a, MwDumpFile b) { public Long getMaximalRevisionId(); /** - * Returns an input stream that provides access to the (uncompressed) XML + * Returns an input stream that provides access to the (uncompressed) text * content of the dump file. *

* It is important to close the stream after use. @@ -108,7 +108,7 @@ public int compare(MwDumpFile a, MwDumpFile b) { public InputStream getDumpFileStream() throws IOException; /** - * Returns a buffered reader that provides access to the (uncompressed) XML + * Returns a buffered reader that provides access to the (uncompressed) text * content of the dump file. *

* It is important to close the reader after use. diff --git a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwDumpFileProcessorImpl.java b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwRevisionDumpFileProcessor.java similarity index 82% rename from wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwDumpFileProcessorImpl.java rename to wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwRevisionDumpFileProcessor.java index 96c3d1ee8..89551b4cd 100644 --- a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwDumpFileProcessorImpl.java +++ b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwRevisionDumpFileProcessor.java @@ -33,20 +33,22 @@ import org.slf4j.LoggerFactory; /** - * This class processes MediaWiki dumpfiles, extracting all revisions and - * forwarding them to any registered revision processor. The class also keeps - * track of whether or not a certain article respectively revision has already - * been encountered. Therefore, no revision is processed twice and the - * registered revision processors can be informed whether the revision is the - * first of the given article or not. The first revision of an article that is - * encountered in a MediaWiki dump file is usually the most recent one. If - * multiple dump files are processed in reverse chronological order, the first - * revision that is encountered is also the most recent one overall. + * This class processes MediaWiki dumpfiles that contain lists of page revisions + * in the specific XML format used by MediaWiki for exporting pages. It extracts + * all revisions and forwards them to any registered revision processor. The + * class also keeps track of whether or not a certain article respectively + * revision has already been encountered. Therefore, no revision is processed + * twice and the registered revision processors can be informed whether the + * revision is the first of the given article or not. The first revision of an + * article that is encountered in a MediaWiki dump file is usually the most + * recent one. If multiple dump files are processed in reverse chronological + * order, the first revision that is encountered is also the most recent one + * overall. 
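Wiring is unchanged by the rename; only the class name is new. A sketch (MyRevisionProcessor stands in for any MwRevisionProcessor implementation, and the input stream and dump file are assumed to be given):

    MwRevisionProcessor revisionProcessor = new MyRevisionProcessor();
    MwDumpFileProcessor dumpFileProcessor =
            new MwRevisionDumpFileProcessor(revisionProcessor);
    // Parsed revisions are forwarded to revisionProcessor one by one:
    dumpFileProcessor.processDumpFileContents(inputStream, dumpFile);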
* * @author Markus Kroetzsch * */ -public class MwDumpFileProcessorImpl implements MwDumpFileProcessor { +public class MwRevisionDumpFileProcessor implements MwDumpFileProcessor { static final String E_MEDIAWIKI = "mediawiki"; static final String E_SITEINFO = "siteinfo"; @@ -78,7 +80,7 @@ public class MwDumpFileProcessorImpl implements MwDumpFileProcessor { static final String E_CONTRIBUTOR_IP = "ip"; static final Logger logger = LoggerFactory - .getLogger(MwDumpFileProcessorImpl.class); + .getLogger(MwRevisionDumpFileProcessor.class); XMLInputFactory xmlFactory; XMLStreamReader xmlReader; @@ -114,7 +116,7 @@ public class MwDumpFileProcessorImpl implements MwDumpFileProcessor { * @param mwRevisionProcessor * the revision processor to which all revisions will be reported */ - public MwDumpFileProcessorImpl(MwRevisionProcessor mwRevisionProcessor) { + public MwRevisionDumpFileProcessor(MwRevisionProcessor mwRevisionProcessor) { this.xmlFactory = XMLInputFactory.newInstance(); this.namespaces = new HashMap(); this.mwRevision = new MwRevisionImpl(); @@ -150,7 +152,7 @@ public void processDumpFileContents(InputStream inputStream, this.xmlReader = this.xmlFactory.createXMLStreamReader(inputStream); processXmlMediawiki(); } catch (XMLStreamException | MwDumpFormatException e) { - MwDumpFileProcessorImpl.logger.error(e.toString()); + MwRevisionDumpFileProcessor.logger.error(e.toString()); } finally { // unfortunately, xmlReader does not implement AutoClosable if (this.xmlReader != null) { try { @@ -187,14 +189,14 @@ void processXmlMediawiki() throws XMLStreamException, MwDumpFormatException { case XMLStreamConstants.START_ELEMENT: switch (this.xmlReader.getLocalName()) { - case MwDumpFileProcessorImpl.E_MEDIAWIKI: + case MwRevisionDumpFileProcessor.E_MEDIAWIKI: break; - case MwDumpFileProcessorImpl.E_SITEINFO: + case MwRevisionDumpFileProcessor.E_SITEINFO: processXmlSiteinfo(); this.mwRevisionProcessor.startRevisionProcessing( this.sitename, this.baseUrl, this.namespaces); break; - case MwDumpFileProcessorImpl.E_PAGE: + case MwRevisionDumpFileProcessor.E_PAGE: tryProcessXmlPage(); break; } @@ -235,25 +237,25 @@ void processXmlSiteinfo() throws XMLStreamException { case XMLStreamConstants.START_ELEMENT: switch (xmlReader.getLocalName()) { - case MwDumpFileProcessorImpl.E_SITENAME: + case MwRevisionDumpFileProcessor.E_SITENAME: this.sitename = this.xmlReader.getElementText(); break; - case MwDumpFileProcessorImpl.E_NAMESPACE: + case MwRevisionDumpFileProcessor.E_NAMESPACE: Integer namespaceKey = new Integer( this.xmlReader.getAttributeValue(null, - MwDumpFileProcessorImpl.A_NSKEY)); + MwRevisionDumpFileProcessor.A_NSKEY)); this.namespaces.put(namespaceKey, this.xmlReader.getElementText()); break; - case MwDumpFileProcessorImpl.E_BASEURL: + case MwRevisionDumpFileProcessor.E_BASEURL: this.baseUrl = this.xmlReader.getElementText(); break; } break; case XMLStreamConstants.END_ELEMENT: - if (MwDumpFileProcessorImpl.E_SITEINFO.equals(this.xmlReader - .getLocalName())) { + if (MwRevisionDumpFileProcessor.E_SITEINFO + .equals(this.xmlReader.getLocalName())) { return; } break; @@ -277,7 +279,7 @@ void tryProcessXmlPage() throws XMLStreamException { try { processXmlPage(); } catch (MwDumpFormatException e) { - MwDumpFileProcessorImpl.logger + MwRevisionDumpFileProcessor.logger .error("Error when trying to process revision block for page \"" + this.mwRevision.getTitle() + "\" (namespace " @@ -287,17 +289,17 @@ void tryProcessXmlPage() throws XMLStreamException { + "): " + e.toString()); - 
MwDumpFileProcessorImpl.logger.info("Trying to recover ..."); + MwRevisionDumpFileProcessor.logger.info("Trying to recover ..."); while (this.xmlReader.hasNext()) { this.xmlReader.next(); if (this.xmlReader.getEventType() == XMLStreamConstants.END_ELEMENT - && this.xmlReader.getLocalName() == MwDumpFileProcessorImpl.E_PAGE) { - MwDumpFileProcessorImpl.logger + && this.xmlReader.getLocalName() == MwRevisionDumpFileProcessor.E_PAGE) { + MwRevisionDumpFileProcessor.logger .info("... recovery successful. Continuing processing."); return; } } - MwDumpFileProcessorImpl.logger + MwRevisionDumpFileProcessor.logger .error("Recovery failed. Could not process remaining XML."); } } @@ -326,21 +328,21 @@ void processXmlPage() throws XMLStreamException, MwDumpFormatException { case XMLStreamConstants.START_ELEMENT: switch (this.xmlReader.getLocalName()) { - case MwDumpFileProcessorImpl.E_PAGE_TITLE: + case MwRevisionDumpFileProcessor.E_PAGE_TITLE: this.mwRevision.title = this.xmlReader.getElementText(); break; - case MwDumpFileProcessorImpl.E_PAGE_NAMESPACE: + case MwRevisionDumpFileProcessor.E_PAGE_NAMESPACE: this.mwRevision.namespace = new Integer( this.xmlReader.getElementText()); break; - case MwDumpFileProcessorImpl.E_PAGE_ID: + case MwRevisionDumpFileProcessor.E_PAGE_ID: this.mwRevision.pageId = new Integer( this.xmlReader.getElementText()); break; - case MwDumpFileProcessorImpl.E_PAGE_REVISION: + case MwRevisionDumpFileProcessor.E_PAGE_REVISION: processXmlRevision(); break; - case MwDumpFileProcessorImpl.E_PAGE_REDIRECT: + case MwRevisionDumpFileProcessor.E_PAGE_REDIRECT: break; default: throw new MwDumpFormatException("Unexpected element \"" @@ -349,7 +351,7 @@ void processXmlPage() throws XMLStreamException, MwDumpFormatException { break; case XMLStreamConstants.END_ELEMENT: - if (MwDumpFileProcessorImpl.E_PAGE.equals(xmlReader + if (MwRevisionDumpFileProcessor.E_PAGE.equals(xmlReader .getLocalName())) { return; } @@ -384,31 +386,31 @@ void processXmlRevision() throws XMLStreamException, MwDumpFormatException { case XMLStreamConstants.START_ELEMENT: switch (this.xmlReader.getLocalName()) { - case MwDumpFileProcessorImpl.E_REV_COMMENT: + case MwRevisionDumpFileProcessor.E_REV_COMMENT: this.mwRevision.comment = this.xmlReader.getElementText(); break; - case MwDumpFileProcessorImpl.E_REV_TEXT: + case MwRevisionDumpFileProcessor.E_REV_TEXT: this.mwRevision.text = this.xmlReader.getElementText(); break; - case MwDumpFileProcessorImpl.E_REV_TIMESTAMP: + case MwRevisionDumpFileProcessor.E_REV_TIMESTAMP: this.mwRevision.timeStamp = this.xmlReader.getElementText(); break; - case MwDumpFileProcessorImpl.E_REV_FORMAT: + case MwRevisionDumpFileProcessor.E_REV_FORMAT: this.mwRevision.format = this.xmlReader.getElementText(); break; - case MwDumpFileProcessorImpl.E_REV_MODEL: + case MwRevisionDumpFileProcessor.E_REV_MODEL: this.mwRevision.model = this.xmlReader.getElementText(); break; - case MwDumpFileProcessorImpl.E_REV_CONTRIBUTOR: + case MwRevisionDumpFileProcessor.E_REV_CONTRIBUTOR: processXmlContributor(); break; - case MwDumpFileProcessorImpl.E_REV_ID: + case MwRevisionDumpFileProcessor.E_REV_ID: this.mwRevision.revisionId = new Long( this.xmlReader.getElementText()); break; - case MwDumpFileProcessorImpl.E_REV_PARENT_ID: - case MwDumpFileProcessorImpl.E_REV_SHA1: - case MwDumpFileProcessorImpl.E_REV_MINOR: + case MwRevisionDumpFileProcessor.E_REV_PARENT_ID: + case MwRevisionDumpFileProcessor.E_REV_SHA1: + case MwRevisionDumpFileProcessor.E_REV_MINOR: break; default: throw new 
MwDumpFormatException("Unexpected element \"" @@ -418,7 +420,7 @@ void processXmlRevision() throws XMLStreamException, MwDumpFormatException { break; case XMLStreamConstants.END_ELEMENT: - if (MwDumpFileProcessorImpl.E_PAGE_REVISION + if (MwRevisionDumpFileProcessor.E_PAGE_REVISION .equals(this.xmlReader.getLocalName())) { this.mwRevisionProcessor.processRevision(this.mwRevision); return; @@ -453,15 +455,15 @@ void processXmlContributor() throws XMLStreamException, case XMLStreamConstants.START_ELEMENT: switch (this.xmlReader.getLocalName()) { - case MwDumpFileProcessorImpl.E_CONTRIBUTOR_NAME: + case MwRevisionDumpFileProcessor.E_CONTRIBUTOR_NAME: this.mwRevision.contributor = this.xmlReader .getElementText(); break; - case MwDumpFileProcessorImpl.E_CONTRIBUTOR_ID: + case MwRevisionDumpFileProcessor.E_CONTRIBUTOR_ID: this.mwRevision.contributorId = new Integer( this.xmlReader.getElementText()); break; - case MwDumpFileProcessorImpl.E_CONTRIBUTOR_IP: + case MwRevisionDumpFileProcessor.E_CONTRIBUTOR_IP: this.mwRevision.contributor = this.xmlReader .getElementText(); this.mwRevision.contributorId = -1; @@ -475,7 +477,7 @@ void processXmlContributor() throws XMLStreamException, break; case XMLStreamConstants.END_ELEMENT: - if (MwDumpFileProcessorImpl.E_REV_CONTRIBUTOR + if (MwRevisionDumpFileProcessor.E_REV_CONTRIBUTOR .equals(this.xmlReader.getLocalName())) { return; } diff --git a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwSitesDumpFileProcessor.java b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwSitesDumpFileProcessor.java new file mode 100644 index 000000000..db3fdcd2a --- /dev/null +++ b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/MwSitesDumpFileProcessor.java @@ -0,0 +1,184 @@ +package org.wikidata.wdtk.dumpfiles; + +/* + * #%L + * Wikidata Toolkit Dump File Handling + * %% + * Copyright (C) 2014 Wikidata Toolkit Developers + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.wikidata.wdtk.datamodel.implementation.SitesImpl; +import org.wikidata.wdtk.datamodel.interfaces.Sites; + +/** + * This class processes dump files that contain the SQL dump of the MediaWiki sites table. + * + * @author Markus Kroetzsch + * + */ +public class MwSitesDumpFileProcessor implements MwDumpFileProcessor { + + static final Logger logger = LoggerFactory + .getLogger(MwSitesDumpFileProcessor.class); + + final SitesImpl sites = new SitesImpl(); + + /** + * Returns the information about sites that has been extracted from the dump + * file(s) processed earlier. 
+ * + * @return the sites information + */ + public Sites getSites() { + return this.sites; + } + + @Override + public void processDumpFileContents(InputStream inputStream, + MwDumpFile dumpFile) { + BufferedReader bufferedReader = new BufferedReader( + new InputStreamReader(inputStream)); + + try { + String line; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("INSERT INTO `sites` VALUES")) { + Matcher matcher = Pattern.compile("[(][^)]*[)]").matcher( + line.substring(27, line.length() - 1)); + while (matcher.find()) { + processSiteRow(matcher.group()); + } + break; // stop after finding rows + } + } + } catch (IOException e) { + MwSitesDumpFileProcessor.logger + .error("IO Error when processing dump of sites table: " + + e.toString()); + } + } + + /** + * Processes a row of the sites table and stores the site information found + * therein. + * + * @param siteRow + * string serialisation of a sites table row as found in the SQL + * dump + */ + void processSiteRow(String siteRow) { + String[] row = getSiteRowFields(siteRow); + + String filePath = ""; + String pagePath = ""; + + String dataArray = row[8].substring(row[8].indexOf('{'), + row[8].length() - 2); + + // Explanation for the regular expression below: + // "'{' or ';'" followed by either + // "NOT: ';', '{', or '}'" repeated one or more times; or + // "a single '}'" + // The first case matches ";s:5:\"paths\"" + // but also ";a:2:" in "{s:5:\"paths\";a:2:{s:9:\ ...". + // The second case matches ";}" which terminates (sub)arrays. + Matcher matcher = Pattern.compile("[{;](([^;}{][^;}{]*)|[}])").matcher( + dataArray); + String prevString = ""; + String curString = ""; + String path = ""; + boolean valuePosition = false; + + while (matcher.find()) { + String match = matcher.group().substring(1); + if (match.length() == 0) { + valuePosition = false; + continue; + } + if (match.charAt(0) == 's') { + valuePosition = !valuePosition && !"".equals(prevString); + curString = match.substring(match.indexOf('"') + 1, + match.length() - 2); + } else if (match.charAt(0) == 'a') { + valuePosition = false; + path = path + "/" + prevString; + } else if ("}".equals(match)) { + valuePosition = false; + path = path.substring(0, path.lastIndexOf('/')); + } + + if (valuePosition && "file_path".equals(prevString) + && "/paths".equals(path)) { + filePath = curString; + } else if (valuePosition && "page_path".equals(prevString) + && "/paths".equals(path)) { + pagePath = curString; + } + + prevString = curString; + curString = ""; + } + + MwSitesDumpFileProcessor.logger.info("Found site data \"" + row[1] + + "\" (group \"" + row[3] + "\", language \"" + row[5] + + "\", type \"" + row[2] + "\")"); + this.sites.setSiteInformation(row[1], row[3], row[5], row[2], "http:" + + filePath, "http:" + pagePath); + } + + /** + * Extract the individual fields for one row in the sites table. The entries + * are encoded by position, with the following meaning: 0: site_id, 1: + * site_global_key, 2: site_type, 3: site_group, 4: site_source 5: + * site_language, 6: site_protocol, 7: site_domain, 8: site_data, 9: + * site_forward, 10: site_config. The method assumes that this is the layout + * of the table, which is the case in MediaWiki 1.21 and above. 
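For illustration (the row below is made up; real rows carry a serialized PHP array in site_data, from which processSiteRow above extracts the file and page paths), a fragment as if written inside this class:

    // (site_id, site_global_key, site_type, site_group, site_source,
    //  site_language, site_protocol, site_domain, site_data, ...)
    String siteRow = "(3,'dewiki','mediawiki','wikipedia','local','de',"
            + "'http','de.wikipedia.org','a:1:{s:5:\"paths\";...}','0','')";

    String[] row = getSiteRowFields(siteRow);
    // row[1] = "dewiki" (global key), row[2] = "mediawiki" (type),
    // row[3] = "wikipedia" (group), row[5] = "de" (language code)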
+ * + * @param siteRow + * the string representation of a row in the sites table, with + * the surrounding parentheses + * @return an array with the individual entries + */ + String[] getSiteRowFields(String siteRow) { + String[] siteRowFields = new String[11]; + + Matcher matcher = Pattern.compile("[(,](['][^']*[']|[^'][^),]*)") + .matcher(siteRow); + int columnIndex = 0; + while (matcher.find()) { + String field = matcher.group().substring(1); + if (field.charAt(0) == '\'') { + field = field.substring(1, field.length() - 1); + } + + siteRowFields[columnIndex] = field; + // ... will throw an exception if there are more fields than + // expected; this is fine. + columnIndex++; + } + return siteRowFields; + } +} diff --git a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfDumpFile.java b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfDumpFile.java index a13e521c0..d15381c80 100644 --- a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfDumpFile.java +++ b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfDumpFile.java @@ -4,6 +4,10 @@ import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; + +import org.wikidata.wdtk.util.CompressionType; /* * #%L @@ -37,14 +41,59 @@ public abstract class WmfDumpFile implements MwDumpFile { * The default URL of the website to obtain the dump files from. */ static final String DUMP_SITE_BASE_URL = "http://dumps.wikimedia.org/"; + + /** + * Hash map defining the relative Web directory of each type of dump. + */ + static final Map WEB_DIRECTORY = new HashMap(); + static { + WmfDumpFile.WEB_DIRECTORY.put(DumpContentType.DAILY, "other/incr/"); + WmfDumpFile.WEB_DIRECTORY.put(DumpContentType.CURRENT, ""); + WmfDumpFile.WEB_DIRECTORY.put(DumpContentType.FULL, ""); + WmfDumpFile.WEB_DIRECTORY.put(DumpContentType.SITES, ""); + } + /** - * The subdirectory of the dumpfile site where daily dumps are kept. + * Hash map defining file name ending of each type of dump. */ - static final String DAILY_WEB_DIRECTORY = "other/incr/"; + static final Map POSTFIXES = new HashMap(); + static { + WmfDumpFile.POSTFIXES.put(DumpContentType.DAILY, + "-pages-meta-hist-incr.xml.bz2"); + WmfDumpFile.POSTFIXES.put(DumpContentType.CURRENT, + "-pages-meta-current.xml.bz2"); + WmfDumpFile.POSTFIXES.put(DumpContentType.FULL, + "-pages-meta-history.xml.bz2"); + WmfDumpFile.POSTFIXES.put(DumpContentType.SITES, "-sites.sql.gz"); + } + + /** + * Hash map defining the compression type of each type of dump. + */ + static final Map COMPRESSION_TYPE = new HashMap(); + static { + WmfDumpFile.COMPRESSION_TYPE.put(DumpContentType.DAILY, + CompressionType.BZ2); + WmfDumpFile.COMPRESSION_TYPE.put(DumpContentType.CURRENT, + CompressionType.BZ2); + WmfDumpFile.COMPRESSION_TYPE.put(DumpContentType.FULL, + CompressionType.BZ2); + WmfDumpFile.COMPRESSION_TYPE.put(DumpContentType.SITES, + CompressionType.GZIP); + } - static final String POSTFIX_DAILY_DUMP_FILE = "-pages-meta-hist-incr.xml.bz2"; - static final String POSTFIX_CURRENT_DUMP_FILE = "-pages-meta-current.xml.bz2"; - static final String POSTFIX_FULL_DUMP_FILE = "-pages-meta-history.xml.bz2"; + /** + * Hash map defining whether a certain type of dump is a dump of page + * revisions or not. Dumps with page revisions have a maximal revision id, + * while other dump files do not. 
+ */ + static final Map REVISION_DUMP = new HashMap(); + static { + WmfDumpFile.REVISION_DUMP.put(DumpContentType.DAILY, true); + WmfDumpFile.REVISION_DUMP.put(DumpContentType.CURRENT, true); + WmfDumpFile.REVISION_DUMP.put(DumpContentType.FULL, true); + WmfDumpFile.REVISION_DUMP.put(DumpContentType.SITES, false); + } /** * The name of the file where a dump's maximal revision id should be stored @@ -126,15 +175,49 @@ public BufferedReader getDumpFileReader() throws IOException { * if the given dump file type is not known */ public static String getDumpFilePostfix(DumpContentType dumpContentType) { - switch (dumpContentType) { - case DAILY: - return WmfDumpFile.POSTFIX_DAILY_DUMP_FILE; - case CURRENT: - return WmfDumpFile.POSTFIX_CURRENT_DUMP_FILE; - case FULL: - return WmfDumpFile.POSTFIX_FULL_DUMP_FILE; - default: - throw new IllegalArgumentException("Unsupported dump type"); + if (WmfDumpFile.POSTFIXES.containsKey(dumpContentType)) { + return WmfDumpFile.POSTFIXES.get(dumpContentType); + } else { + throw new IllegalArgumentException("Unsupported dump type " + + dumpContentType); + } + } + + /** + * Returns the relative directory on the Web site where dumpfiles of the + * given type can be found. + * + * @param dumpContentType + * the type of dump + * @return relative web directory for the current dumpfiles + * @throws IllegalArgumentException + * if the given dump file type is not known + */ + public static String getDumpFileWebDirectory(DumpContentType dumpContentType) { + if (WmfDumpFile.WEB_DIRECTORY.containsKey(dumpContentType)) { + return WmfDumpFile.WEB_DIRECTORY.get(dumpContentType); + } else { + throw new IllegalArgumentException("Unsupported dump type " + + dumpContentType); + } + } + + /** + * Returns the compression type of this kind of dump file. + * + * @param dumpContentType + * the type of dump + * @return compression type + * @throws IllegalArgumentException + * if the given dump file type is not known + */ + public static CompressionType getDumpFileCompressionType( + DumpContentType dumpContentType) { + if (WmfDumpFile.COMPRESSION_TYPE.containsKey(dumpContentType)) { + return WmfDumpFile.COMPRESSION_TYPE.get(dumpContentType); + } else { + throw new IllegalArgumentException("Unsupported dump type " + + dumpContentType); } } @@ -187,4 +270,24 @@ public static String getDumpFileName(DumpContentType dumpContentType, + WmfDumpFile.getDumpFilePostfix(dumpContentType); } + /** + * Returns true if the given dump file type contains page revisions and + * false if it does not. Dumps that do not contain pages are for auxiliary + * information such as linked sites. 
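Taken together, these maps make the per-type configuration of WmfDumpFile data-driven instead of switch-based, as the lookup methods above show. Reduced to a self-contained sketch (the class and method names here are invented; the enum constants and file name endings are copied from the patch):

import java.util.HashMap;
import java.util.Map;

public class DumpTypeLookupDemo {
	enum DumpContentType {
		DAILY, CURRENT, FULL, SITES
	}

	static final Map<DumpContentType, String> POSTFIXES = new HashMap<DumpContentType, String>();
	static {
		POSTFIXES.put(DumpContentType.DAILY, "-pages-meta-hist-incr.xml.bz2");
		POSTFIXES.put(DumpContentType.SITES, "-sites.sql.gz");
	}

	static String getPostfix(DumpContentType dumpContentType) {
		if (POSTFIXES.containsKey(dumpContentType)) {
			return POSTFIXES.get(dumpContentType);
		}
		// Unknown types fail fast, as in WmfDumpFile.getDumpFilePostfix():
		throw new IllegalArgumentException("Unsupported dump type "
				+ dumpContentType);
	}

	public static void main(String[] args) {
		// Prints "wikidatawiki-20140221-pages-meta-hist-incr.xml.bz2":
		System.out.println("wikidatawiki-20140221"
				+ getPostfix(DumpContentType.DAILY));
	}
}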
+ * + * @param dumpContentType + * the type of dump + * @return true if the dumpfile contains revisions + * @throws IllegalArgumentException + * if the given dump file type is not known + */ + public static boolean isRevisionDumpFile(DumpContentType dumpContentType) { + if (WmfDumpFile.REVISION_DUMP.containsKey(dumpContentType)) { + return WmfDumpFile.REVISION_DUMP.get(dumpContentType); + } else { + throw new IllegalArgumentException("Unsupported dump type " + + dumpContentType); + } + } + } \ No newline at end of file diff --git a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfDumpFileManager.java b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfDumpFileManager.java index 999c81cd9..b3e747265 100644 --- a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfDumpFileManager.java +++ b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfDumpFileManager.java @@ -40,9 +40,9 @@ /** * Class for providing access to available dumpfiles provided by the Wikimedia * Foundation. The preferred access point for this class is - * {@link #processAllRecentDumps(MwDumpFileProcessor, boolean)}, since this - * method takes care of freeing resources and might also provide parallelized - * downloading/processing in the future. + * {@link #processAllRecentRevisionDumps(MwDumpFileProcessor, boolean)}, since + * this method takes care of freeing resources and might also provide + * parallelized downloading/processing in the future. *
* Typically, the Web will be accessed to find information about dumps available * online. This Web access is mediated by a {@link WebResourceFetcherImpl} @@ -108,17 +108,16 @@ public WmfDumpFileManager(String projectName, } /** - * Processes all relevant dumps in order. For further details on the - * parameters, see {@link #findAllRelevantDumps(boolean)}. + * Processes all relevant page revision dumps in order. For further details + * on the parameters, see {@link #findAllRelevantRevisionDumps(boolean)}. * * @param preferCurrent * should dumps with current revisions be preferred? - * @return most recent main dump or null if no such dump exists */ - public void processAllRecentDumps(MwDumpFileProcessor dumpFileProcessor, - boolean preferCurrent) { + public void processAllRecentRevisionDumps( + MwDumpFileProcessor dumpFileProcessor, boolean preferCurrent) { - for (MwDumpFile dumpFile : findAllRelevantDumps(preferCurrent)) { + for (MwDumpFile dumpFile : findAllRelevantRevisionDumps(preferCurrent)) { try (InputStream inputStream = dumpFile.getDumpFileStream()) { logger.info("Processing dump file " + dumpFile.toString()); dumpFileProcessor @@ -137,8 +136,9 @@ public void processAllRecentDumps(MwDumpFileProcessor dumpFileProcessor, } /** - * Finds all dump files, online or locally, that are relevant to obtain the - * most current state of the data. + * Finds all page revision dump files, online or locally, that are relevant + * to obtain the most current state of the data. Revision dump files are + * dumps that contain page revisions in MediaWiki's XML format. *
* If the parameter preferCurrent is true, then dumps that contain * only the current versions of all files will be preferred if available @@ -158,15 +158,20 @@ public void processAllRecentDumps(MwDumpFileProcessor dumpFileProcessor, * should dumps with current revisions be preferred? * @return an ordered list of all dump files that match the given criteria */ - public List findAllRelevantDumps(boolean preferCurrent) { - MwDumpFile mainDump = findMostRecentMainDump(preferCurrent); + public List findAllRelevantRevisionDumps(boolean preferCurrent) { + MwDumpFile mainDump; + if (preferCurrent) { + mainDump = findMostRecentDump(DumpContentType.CURRENT); + } else { + mainDump = findMostRecentDump(DumpContentType.FULL); + } if (mainDump == null) { - return findAllDailyDumps(); + return findAllDumps(DumpContentType.DAILY); } List result = new ArrayList(); - for (MwDumpFile dumpFile : findAllDailyDumps()) { + for (MwDumpFile dumpFile : findAllDumps(DumpContentType.DAILY)) { if (dumpFile.getDateStamp().compareTo(mainDump.getDateStamp()) > 0) { result.add(dumpFile); } @@ -179,73 +184,35 @@ /** * Finds the most recent main dump (non-incremental dump). For further - * details on the parameters, see {@link #findAllRelevantDumps(boolean)}. + * details on the parameters, see + * {@link #findAllRelevantRevisionDumps(boolean)}. * - * @param preferCurrent - * should dumps with current revisions be preferred? + * @param dumpContentType + * the type of the dump to look for * @return most recent main dump or null if no such dump exists */ - public MwDumpFile findMostRecentMainDump(boolean preferCurrent) { - List mainDumps; - if (preferCurrent) { - mainDumps = findAllCurrentDumps(); - } else { - mainDumps = findAllFullDumps(); - } + public MwDumpFile findMostRecentDump(DumpContentType dumpContentType) { + List dumps = findAllDumps(dumpContentType); - if (mainDumps.size() == 0) { + if (dumps.size() == 0) { return null; } else { - return mainDumps.get(0); + return dumps.get(0); } } /** - * Returns a list of all daily dump files available either online or - * locally. For dumps available both online and locally, the local version - * is included. The list is order with most recent dump date first. + * Returns a list of all dump files of the given type available either + * online or locally. For dumps available both online and locally, the local + * version is included. The list is ordered with most recent dump date first. * - * @return a list of daily dump files + * @return a list of dump files of the given type */ - public List findAllDailyDumps() { - List localDumps = findDumpsLocally(DumpContentType.DAILY); + public List findAllDumps(DumpContentType dumpContentType) { + List localDumps = findDumpsLocally(dumpContentType); if (this.webResourceFetcher != null) { - List onlineDumps = findDailyDumpsOnline(); - return mergeDumpLists(localDumps, onlineDumps); - } else { - return localDumps; - } - } - - /** - * Returns a list of all current dump files available either online or - * locally. For dumps available both online and locally, the local version - * is included. The list is order with most recent dump date first.
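In other words, the relevant revision dumps are the chosen main dump plus every daily dump with a strictly newer date stamp, most recent first. A self-contained sketch of that selection rule with invented date stamps (the real code compares MwDumpFile date stamps the same way):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class RelevantDumpsDemo {
	public static void main(String[] args) {
		String mainDumpDate = "20140210"; // most recent CURRENT or FULL dump
		// Daily dumps as returned by findAllDumps(), most recent first:
		List<String> dailyDates = Arrays.asList("20140212", "20140211",
				"20140210", "20140209");

		List<String> result = new ArrayList<String>();
		for (String daily : dailyDates) {
			// Same comparison as in findAllRelevantRevisionDumps():
			if (daily.compareTo(mainDumpDate) > 0) {
				result.add(daily);
			}
		}
		result.add(mainDumpDate);
		// Prints "[20140212, 20140211, 20140210]":
		System.out.println(result);
	}
}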
- * - * @return a list of current dump files - */ - public List findAllCurrentDumps() { - List localDumps = findDumpsLocally(DumpContentType.CURRENT); - if (this.webResourceFetcher != null) { - List onlineDumps = findCurrentDumpsOnline(); - return mergeDumpLists(localDumps, onlineDumps); - } else { - return localDumps; - } - } - - /** - * Finds a list of all full dump files available either online or locally. - * For dumps available both online and locally, the local version is - * included. The list is order with most recent dump date first. - * - * @return a list of full dump files - */ - public List findAllFullDumps() { - List localDumps = findDumpsLocally(DumpContentType.FULL); - if (this.webResourceFetcher != null) { - List onlineDumps = findFullDumpsOnline(); + List onlineDumps = findDumpsOnline(dumpContentType); return mergeDumpLists(localDumps, onlineDumps); } else { return localDumps; @@ -328,66 +295,29 @@ List findDumpsLocally(DumpContentType dumpContentType) { } /** - * Finds out which daily dump files are available for download. The result - * is a list of objects that describe the available dump files, in - * descending order by their date. Not all of the dumps included might be - * actually available. - * - * @return list of objects that provide information on available daily dumps - */ - List findDailyDumpsOnline() { - List dumpFileDates = findDumpDatesOnline(WmfDumpFile.DAILY_WEB_DIRECTORY); - - List result = new ArrayList(); - - for (String dateStamp : dumpFileDates) { - result.add(new WmfOnlineDailyDumpFile(dateStamp, this.projectName, - this.webResourceFetcher, this.dumpfileDirectoryManager)); - } - - return result; - } - - /** - * Finds out which current version dump files are available for download. + * Finds out which dump files of the given type are available for download. * The result is a list of objects that describe the available dump files, * in descending order by their date. Not all of the dumps included might be * actually available. * - * @return list of objects that provide information on available current - * dumps - */ - List findCurrentDumpsOnline() { - List dumpFileDates = findDumpDatesOnline(""); - - List result = new ArrayList(); - - for (String dateStamp : dumpFileDates) { - result.add(new WmfOnlineStandardDumpFile(dateStamp, - this.projectName, this.webResourceFetcher, - this.dumpfileDirectoryManager, DumpContentType.CURRENT)); - } - - return result; - } - - /** - * Finds out which full dump files are available for download. The result is - * a list of objects that describe the available dump files, in descending - * order by their date. Not all of the dumps included might be actually - * available. 
- * * @return list of objects that provide information on available full dumps */ - List findFullDumpsOnline() { - List dumpFileDates = findDumpDatesOnline(""); + List findDumpsOnline(DumpContentType dumpContentType) { + List dumpFileDates = findDumpDatesOnline(WmfDumpFile + .getDumpFileWebDirectory(dumpContentType)); List result = new ArrayList(); for (String dateStamp : dumpFileDates) { - result.add(new WmfOnlineStandardDumpFile(dateStamp, - this.projectName, this.webResourceFetcher, - this.dumpfileDirectoryManager, DumpContentType.FULL)); + if (dumpContentType == DumpContentType.DAILY) { + result.add(new WmfOnlineDailyDumpFile(dateStamp, + this.projectName, this.webResourceFetcher, + this.dumpfileDirectoryManager)); + } else { + result.add(new WmfOnlineStandardDumpFile(dateStamp, + this.projectName, this.webResourceFetcher, + this.dumpfileDirectoryManager, dumpContentType)); + } } return result; diff --git a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfLocalDumpFile.java b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfLocalDumpFile.java index 74e243488..9ff92089b 100644 --- a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfLocalDumpFile.java +++ b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfLocalDumpFile.java @@ -26,6 +26,7 @@ import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; +import org.wikidata.wdtk.util.CompressionType; import org.wikidata.wdtk.util.DirectoryManager; /** @@ -97,9 +98,12 @@ public DumpContentType getDumpContentType() { @Override public InputStream getDumpFileStream() throws IOException { - return this.localDumpfileDirectoryManager - .getInputStreamForBz2File(WmfDumpFile.getDumpFileName( - this.dumpContentType, this.projectName, this.dateStamp)); + String dumpFileName = WmfDumpFile.getDumpFileName(this.dumpContentType, + this.projectName, this.dateStamp); + + return this.localDumpfileDirectoryManager.getInputStreamForFile( + dumpFileName, + WmfDumpFile.getDumpFileCompressionType(this.dumpContentType)); } @Override @@ -109,9 +113,14 @@ public void prepareDumpFile() throws IOException { @Override protected Long fetchMaximalRevisionId() { + if (!WmfDumpFile.isRevisionDumpFile(this.dumpContentType)) { + return -1L; + } + String inputLine; try (InputStream in = this.localDumpfileDirectoryManager - .getInputStreamForFile(WmfDumpFile.LOCAL_FILENAME_MAXREVID)) { + .getInputStreamForFile(WmfDumpFile.LOCAL_FILENAME_MAXREVID, + CompressionType.NONE)) { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader(in, StandardCharsets.UTF_8)); inputLine = bufferedReader.readLine(); @@ -135,7 +144,8 @@ protected boolean fetchIsDone() { return this.localDumpfileDirectoryManager.hasFile(WmfDumpFile .getDumpFileName(this.dumpContentType, this.projectName, this.dateStamp)) - && this.getMaximalRevisionId() >= 0; + && (this.getMaximalRevisionId() >= 0 || !WmfDumpFile + .isRevisionDumpFile(this.dumpContentType)); } } diff --git a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfOnlineDailyDumpFile.java b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfOnlineDailyDumpFile.java index 6bb6f92a8..8913819a3 100644 --- a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfOnlineDailyDumpFile.java +++ b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfOnlineDailyDumpFile.java @@ -83,7 +83,8 @@ public InputStream getDumpFileStream() throws IOException { .getSubdirectoryManager(WmfDumpFile.getDumpFileDirectoryName( DumpContentType.DAILY, this.dateStamp)); - 
return dailyDirectoryManager.getInputStreamForBz2File(fileName); + return dailyDirectoryManager.getInputStreamForFile(fileName, + WmfDumpFile.getDumpFileCompressionType(DumpContentType.DAILY)); } @Override @@ -161,7 +162,8 @@ protected boolean fetchIsDone() { * @return base URL */ String getBaseUrl() { - return WmfDumpFile.DUMP_SITE_BASE_URL + WmfDumpFile.DAILY_WEB_DIRECTORY + return WmfDumpFile.DUMP_SITE_BASE_URL + + WmfDumpFile.getDumpFileWebDirectory(DumpContentType.DAILY) + this.projectName + "/" + this.dateStamp + "/"; } diff --git a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfOnlineStandardDumpFile.java b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfOnlineStandardDumpFile.java index 4685b926e..686b6b693 100644 --- a/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfOnlineStandardDumpFile.java +++ b/wdtk-dumpfiles/src/main/java/org/wikidata/wdtk/dumpfiles/WmfOnlineStandardDumpFile.java @@ -83,7 +83,6 @@ public DumpContentType getDumpContentType() { @Override public InputStream getDumpFileStream() throws IOException { - prepareDumpFile(); String fileName = WmfDumpFile.getDumpFileName(this.dumpContentType, @@ -92,7 +91,8 @@ public InputStream getDumpFileStream() throws IOException { .getSubdirectoryManager(WmfDumpFile.getDumpFileDirectoryName( this.dumpContentType, this.dateStamp)); - return thisDumpDirectoryManager.getInputStreamForBz2File(fileName); + return thisDumpDirectoryManager.getInputStreamForFile(fileName, + WmfDumpFile.getDumpFileCompressionType(this.dumpContentType)); } @Override @@ -105,7 +105,8 @@ public void prepareDumpFile() throws IOException { + this.dumpContentType.toString().toLowerCase() + " dump file " + fileName + " from " + urlString + " ..."); - if (this.getMaximalRevisionId() == -1) { + if (WmfDumpFile.isRevisionDumpFile(this.dumpContentType) + && this.getMaximalRevisionId() == -1) { throw new IOException( "Failed to retrieve maximal revision id. Aborting dump retrieval."); } @@ -119,9 +120,11 @@ public void prepareDumpFile() throws IOException { thisDumpDirectoryManager.createFile(fileName, inputStream); } - thisDumpDirectoryManager.createFile( - WmfDumpFile.LOCAL_FILENAME_MAXREVID, this - .getMaximalRevisionId().toString()); + if (WmfDumpFile.isRevisionDumpFile(this.dumpContentType)) { + thisDumpDirectoryManager.createFile( + WmfDumpFile.LOCAL_FILENAME_MAXREVID, this + .getMaximalRevisionId().toString()); + } logger.info("... 
Completed download of " + this.dumpContentType.toString().toLowerCase() + " dump file " @@ -130,6 +133,10 @@ public void prepareDumpFile() throws IOException { @Override protected Long fetchMaximalRevisionId() { + if (!WmfDumpFile.isRevisionDumpFile(this.dumpContentType)) { + return -1L; + } + Long maxRevId = -1L; String urlString = getBaseUrl(); try (InputStream in = this.webResourceFetcher @@ -196,8 +203,9 @@ protected boolean fetchIsDone() { * @return base URL */ String getBaseUrl() { - return WmfDumpFile.DUMP_SITE_BASE_URL + this.projectName + "/" - + this.dateStamp + "/"; + return WmfDumpFile.DUMP_SITE_BASE_URL + + WmfDumpFile.getDumpFileWebDirectory(this.dumpContentType) + + this.projectName + "/" + this.dateStamp + "/"; } } diff --git a/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/JsonConverterTest.java b/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/JsonConverterTest.java index 52b1b868e..f913be41e 100644 --- a/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/JsonConverterTest.java +++ b/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/JsonConverterTest.java @@ -166,11 +166,10 @@ private ItemDocument createBasicItemDocument() { statementGroups.add(this.factory.getStatementGroup(statements)); Map siteLinks = new HashMap<>(); - List badges = new LinkedList<>(); - String siteKey = "enwiki"; - String title = "test"; - siteLinks.put("enwiki", - this.factory.getSiteLink(title, siteKey, "", badges)); + siteLinks.put( + "enwiki", + this.factory.getSiteLink("test", "enwiki", + Collections. emptyList())); ItemDocument document = this.factory.getItemDocument(itemIdValue, labels, descriptions, aliases, statementGroups, siteLinks); diff --git a/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/MwDumpFileProcessingTest.java b/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/MwDumpFileProcessingTest.java index 7c0fc926d..827f7a84d 100644 --- a/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/MwDumpFileProcessingTest.java +++ b/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/MwDumpFileProcessingTest.java @@ -168,7 +168,7 @@ public void testIncompleteDumpFile() throws IOException { TestMwRevisionProcessor tmrpAll = new TestMwRevisionProcessor(); mwrpBroker.registerMwRevisionProcessor(tmrpAll, null, false); - MwDumpFileProcessorImpl mwdfp = new MwDumpFileProcessorImpl(mwrpBroker); + MwRevisionDumpFileProcessor mwdfp = new MwRevisionDumpFileProcessor(mwrpBroker); mwdfp.processDumpFileContents(resourceUrl.openStream(), mockDumpFile); List revisionsAll = new ArrayList(); @@ -189,7 +189,7 @@ public void testBuggyDumpFile() throws IOException { TestMwRevisionProcessor tmrpAll = new TestMwRevisionProcessor(); mwrpBroker.registerMwRevisionProcessor(tmrpAll, null, false); - MwDumpFileProcessorImpl mwdfp = new MwDumpFileProcessorImpl(mwrpBroker); + MwRevisionDumpFileProcessor mwdfp = new MwRevisionDumpFileProcessor(mwrpBroker); mwdfp.processDumpFileContents(resourceUrl.openStream(), mockDumpFile); List revisionsAll = new ArrayList(); @@ -221,7 +221,7 @@ public void testMwDumpFileProcessing() throws IOException { mwrpBroker.registerMwRevisionProcessor(tmrpAllItems, "wikibase-item", false); - MwDumpFileProcessorImpl mwdfp = new MwDumpFileProcessorImpl(mwrpBroker); + MwRevisionDumpFileProcessor mwdfp = new MwRevisionDumpFileProcessor(mwrpBroker); mwdfp.processDumpFileContents(resourceUrl.openStream(), mockDumpFile); List revisionsAllItems = new ArrayList(); diff --git 
a/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfDumpFileManagerTest.java b/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfDumpFileManagerTest.java index cb9a3f5d5..6694e2aa6 100644 --- a/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfDumpFileManagerTest.java +++ b/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfDumpFileManagerTest.java @@ -103,8 +103,7 @@ void setLocalDump(String dateStamp, DumpContentType dumpContentType, public void getAllDailyDumps() throws IOException { wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/other/incr/wikidatawiki/", - "/other-incr-wikidatawiki-index.html", - MockWebResourceFetcher.TYPE_HTML, this.getClass()); + "/other-incr-wikidatawiki-index.html", this.getClass()); setLocalDump("20140220", DumpContentType.DAILY, true); setLocalDump("20140219", DumpContentType.CURRENT, true); @@ -116,7 +115,7 @@ public void getAllDailyDumps() throws IOException { "wikidatawiki", dm, wrf); List dumpFiles = dumpFileManager - .findAllDailyDumps(); + .findAllDumps(DumpContentType.DAILY); String[] dumpDates = { "20140221", "20140220", "20140219", "20140218", "20140217", "20140216", "20140215", "20140214", "20140213", @@ -131,9 +130,13 @@ public void getAllDailyDumps() throws IOException { DumpContentType.DAILY); assertEquals(dumpFiles.get(i).getDateStamp(), dumpDates[i]); if (dumpIsLocal[i]) { - assertTrue(dumpFiles.get(i) instanceof WmfLocalDumpFile); + assertTrue( + "Dumpfile " + dumpFiles.get(i) + " should be local.", + dumpFiles.get(i) instanceof WmfLocalDumpFile); } else { - assertTrue(dumpFiles.get(i) instanceof WmfOnlineDailyDumpFile); + assertTrue("Dumpfile " + dumpFiles.get(i) + + " should be online.", + dumpFiles.get(i) instanceof WmfOnlineDailyDumpFile); } } } @@ -142,8 +145,7 @@ public void getAllDailyDumps() throws IOException { public void getAllCurrentDumps() throws IOException { wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/wikidatawiki/", - "/wikidatawiki-index.html", MockWebResourceFetcher.TYPE_HTML, - this.getClass()); + "/wikidatawiki-index.html", this.getClass()); setLocalDump("20140210", DumpContentType.CURRENT, false); setLocalDump("20140123", DumpContentType.CURRENT, true); @@ -155,7 +157,7 @@ public void getAllCurrentDumps() throws IOException { "wikidatawiki", dm, wrf); List dumpFiles = dumpFileManager - .findAllCurrentDumps(); + .findAllDumps(DumpContentType.CURRENT); String[] dumpDates = { "20140210", "20140123", "20140106", "20131221", "20131201" }; @@ -178,8 +180,7 @@ public void getAllCurrentDumps() throws IOException { public void getAllFullDumps() throws IOException { wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/wikidatawiki/", - "/wikidatawiki-index.html", MockWebResourceFetcher.TYPE_HTML, - this.getClass()); + "/wikidatawiki-index.html", this.getClass()); setLocalDump("20140210", DumpContentType.FULL, false); setLocalDump("20140123", DumpContentType.FULL, true); @@ -191,7 +192,7 @@ public void getAllFullDumps() throws IOException { "wikidatawiki", dm, wrf); List dumpFiles = dumpFileManager - .findAllFullDumps(); + .findAllDumps(DumpContentType.FULL); String[] dumpDates = { "20140210", "20140123", "20140106", "20131221", "20131201" }; @@ -219,7 +220,7 @@ public void getAllDailyDumpsOffline() throws IOException { "wikidatawiki", dm, null); List dumpFiles = dumpFileManager - .findAllDailyDumps(); + .findAllDumps(DumpContentType.DAILY); String[] dumpDates = { "20140220", "20140205" }; @@ -241,7 +242,7 @@ public void 
getAllCurrentDumpsOffline() throws IOException { "wikidatawiki", dm, null); List dumpFiles = dumpFileManager - .findAllCurrentDumps(); + .findAllDumps(DumpContentType.CURRENT); String[] dumpDates = { "20140220", "20140205" }; @@ -263,7 +264,7 @@ public void getAllFullDumpsOffline() throws IOException { "wikidatawiki", dm, null); List dumpFiles = dumpFileManager - .findAllFullDumps(); + .findAllDumps(DumpContentType.FULL); String[] dumpDates = { "20140220", "20140205" }; @@ -280,12 +281,10 @@ public void getAllFullDumpsOffline() throws IOException { public void getAllRelevantDumps() throws IOException { wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/other/incr/wikidatawiki/", - "/other-incr-wikidatawiki-index.html", - MockWebResourceFetcher.TYPE_HTML, this.getClass()); + "/other-incr-wikidatawiki-index.html", this.getClass()); wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/wikidatawiki/", - "/wikidatawiki-index.html", MockWebResourceFetcher.TYPE_HTML, - this.getClass()); + "/wikidatawiki-index.html", this.getClass()); setLocalDump("20140220", DumpContentType.DAILY, true); setLocalDump("20140219", DumpContentType.FULL, true); @@ -295,7 +294,7 @@ public void getAllRelevantDumps() throws IOException { "wikidatawiki", dm, wrf); List dumpFiles = dumpFileManager - .findAllRelevantDumps(true); + .findAllRelevantRevisionDumps(true); String[] dumpDates = { "20140221", "20140220", "20140219", "20140218", "20140217", "20140216", "20140215", "20140214", "20140213", @@ -330,7 +329,7 @@ public void getAllRelevantDumpsMainDumpMissing() throws IOException { "wikidatawiki", dm, wrf); List dumpFiles = dumpFileManager - .findAllRelevantDumps(false); + .findAllRelevantRevisionDumps(false); assertEquals(dumpFiles.size(), 1); assertEquals(dumpFiles.get(0).getDumpContentType(), @@ -351,7 +350,7 @@ public void processAllRelevantDumps() throws IOException { TestDumpfileProcessor dfp = new TestDumpfileProcessor(); - dumpFileManager.processAllRecentDumps(dfp, true); + dumpFileManager.processAllRecentRevisionDumps(dfp, true); assertEquals( dfp.result, diff --git a/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfOnlineDailyDumpFileTest.java b/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfOnlineDailyDumpFileTest.java index 3c61a87d3..4e1ee5cba 100644 --- a/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfOnlineDailyDumpFileTest.java +++ b/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfOnlineDailyDumpFileTest.java @@ -50,17 +50,14 @@ public void validDumpProperties() throws IOException { String maxRevId = "110690987"; wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" - + dateStamp + "/maxrevid.txt", maxRevId, - MockWebResourceFetcher.TYPE_HTML); + + dateStamp + "/maxrevid.txt", maxRevId); wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" - + dateStamp + "/status.txt", "done", - MockWebResourceFetcher.TYPE_HTML); + + dateStamp + "/status.txt", "done"); wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" + dateStamp + "/wikidatawiki-" + dateStamp - + "-pages-meta-hist-incr.xml.bz2", "Line1", - MockWebResourceFetcher.TYPE_HTML); + + "-pages-meta-hist-incr.xml.bz2", "Line1"); WmfOnlineDailyDumpFile dump = new WmfOnlineDailyDumpFile(dateStamp, "wikidatawiki", wrf, dm); @@ -93,12 +90,10 @@ public void emptyDumpProperties() { String dateStamp = "20140220"; wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" - + 
dateStamp + "/maxrevid.txt", "", - MockWebResourceFetcher.TYPE_HTML); + + dateStamp + "/maxrevid.txt", ""); wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" - + dateStamp + "/status.txt", "", - MockWebResourceFetcher.TYPE_HTML); + + dateStamp + "/status.txt", ""); WmfOnlineDailyDumpFile dump = new WmfOnlineDailyDumpFile(dateStamp, "wikidatawiki", wrf, dm); @@ -114,8 +109,7 @@ public void malformedRevisionId() { "wikidatawiki", wrf, dm); wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" - + dateStamp + "/maxrevid.txt", "nan", - MockWebResourceFetcher.TYPE_HTML); + + dateStamp + "/maxrevid.txt", "nan"); assertEquals(dump.getMaximalRevisionId(), new Long(-1)); } @@ -127,8 +121,7 @@ public void inaccessibleRevisionId() { "wikidatawiki", wrf, dm); wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" - + dateStamp + "/maxrevid.txt", "1234567", - MockWebResourceFetcher.TYPE_HTML); + + dateStamp + "/maxrevid.txt", "1234567"); wrf.setReturnFailingReaders(true); assertEquals(dump.getMaximalRevisionId(), new Long(-1)); @@ -139,12 +132,10 @@ public void inaccessibleStatus() { String dateStamp = "20140220"; wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" - + dateStamp + "/maxrevid.txt", "1234567", - MockWebResourceFetcher.TYPE_HTML); + + dateStamp + "/maxrevid.txt", "1234567"); wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" - + dateStamp + "/status.txt", "done", - MockWebResourceFetcher.TYPE_HTML); + + dateStamp + "/status.txt", "done"); wrf.setReturnFailingReaders(true); WmfOnlineDailyDumpFile dump = new WmfOnlineDailyDumpFile(dateStamp, "wikidatawiki", wrf, dm); @@ -158,8 +149,7 @@ public void downloadNoRevisionId() throws IOException { wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" + dateStamp + "/wikidatawiki-" + dateStamp - + "-pages-meta-hist-incr.xml.bz2", "Line1", - MockWebResourceFetcher.TYPE_HTML); + + "-pages-meta-hist-incr.xml.bz2", "Line1"); WmfOnlineDailyDumpFile dump = new WmfOnlineDailyDumpFile(dateStamp, "wikidatawiki", wrf, dm); dump.getDumpFileReader(); @@ -171,12 +161,10 @@ public void downloadNoDumpFile() throws IOException { String maxRevId = "110690987"; wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" - + dateStamp + "/maxrevid.txt", maxRevId, - MockWebResourceFetcher.TYPE_HTML); + + dateStamp + "/maxrevid.txt", maxRevId); wrf.setWebResourceContents( "http://dumps.wikimedia.org/other/incr/wikidatawiki/" - + dateStamp + "/status.txt", "done", - MockWebResourceFetcher.TYPE_HTML); + + dateStamp + "/status.txt", "done"); WmfOnlineDailyDumpFile dump = new WmfOnlineDailyDumpFile(dateStamp, "wikidatawiki", wrf, dm); dump.getDumpFileReader(); diff --git a/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfOnlineStandardDumpFileTest.java b/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfOnlineStandardDumpFileTest.java index 9e728586c..3ed8b30c2 100644 --- a/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfOnlineStandardDumpFileTest.java +++ b/wdtk-dumpfiles/src/test/java/org/wikidata/wdtk/dumpfiles/WmfOnlineStandardDumpFileTest.java @@ -48,15 +48,13 @@ public void setUp() throws IOException { public void validCurrentDumpProperties() throws IOException { wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/wikidatawiki/20140210/", - "/wikidatawiki-20140210-index.html", - MockWebResourceFetcher.TYPE_HTML, 
this.getClass()); + "/wikidatawiki-20140210-index.html", this.getClass()); wrf.setWebResourceContents( "http://dumps.wikimedia.org/wikidatawiki/20140210/wikidatawiki-20140210-pages-meta-current.xml.bz2", - "Line1", MockWebResourceFetcher.TYPE_BZ2); + "Line1"); wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/wikidatawiki/20140210/wikidatawiki-20140210-md5sums.txt", - "/wikidatawiki-20140210-md5sums.txt", - MockWebResourceFetcher.TYPE_HTML, this.getClass()); + "/wikidatawiki-20140210-md5sums.txt", this.getClass()); MwDumpFile dump = new WmfOnlineStandardDumpFile("20140210", "wikidatawiki", wrf, dm, DumpContentType.CURRENT); @@ -84,15 +82,13 @@ public void missingFullDumpProperties() { public void inaccessibleCurrentDumpProperties() throws IOException { wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/wikidatawiki/20140210/", - "/wikidatawiki-20140210-index.html", - MockWebResourceFetcher.TYPE_HTML, this.getClass()); + "/wikidatawiki-20140210-index.html", this.getClass()); wrf.setWebResourceContents( "http://dumps.wikimedia.org/wikidatawiki/20140210/wikidatawiki-20140210-pages-meta-current.xml.bz2", - "Line1", MockWebResourceFetcher.TYPE_BZ2); + "Line1"); wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/wikidatawiki/20140210/wikidatawiki-20140210-md5sums.txt", - "/wikidatawiki-20140210-md5sums.txt", - MockWebResourceFetcher.TYPE_HTML, this.getClass()); + "/wikidatawiki-20140210-md5sums.txt", this.getClass()); wrf.setReturnFailingReaders(true); MwDumpFile dump = new WmfOnlineStandardDumpFile("20140210", @@ -106,8 +102,7 @@ public void inaccessibleCurrentDumpProperties() throws IOException { public void emptyFullDumpIsDone() throws IOException { wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/wikidatawiki/20140210/", - "/wikidatawiki-20140210-index.html", - MockWebResourceFetcher.TYPE_HTML, this.getClass()); + "/wikidatawiki-20140210-index.html", this.getClass()); MwDumpFile dump = new WmfOnlineStandardDumpFile("20140210", "wikidatawiki", wrf, dm, DumpContentType.FULL); @@ -129,7 +124,7 @@ public void emptyFullDumpRevisionId() throws IOException { public void downloadNoRevisionId() throws IOException { wrf.setWebResourceContents( "http://dumps.wikimedia.org/wikidatawiki/20140210/wikidatawiki-20140210-pages-meta-current.xml.bz2", - "Line1", MockWebResourceFetcher.TYPE_BZ2); + "Line1"); MwDumpFile dump = new WmfOnlineStandardDumpFile("20140210", "wikidatawiki", wrf, dm, DumpContentType.FULL); dump.getDumpFileReader(); @@ -139,12 +134,10 @@ public void downloadNoRevisionId() throws IOException { public void downloadNoDumpFile() throws IOException { wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/wikidatawiki/20140210/", - "/wikidatawiki-20140210-index.html", - MockWebResourceFetcher.TYPE_HTML, this.getClass()); + "/wikidatawiki-20140210-index.html", this.getClass()); wrf.setWebResourceContentsFromResource( "http://dumps.wikimedia.org/wikidatawiki/20140210/wikidatawiki-20140210-md5sums.txt", - "/wikidatawiki-20140210-md5sums.txt", - MockWebResourceFetcher.TYPE_HTML, this.getClass()); + "/wikidatawiki-20140210-md5sums.txt", this.getClass()); MwDumpFile dump = new WmfOnlineStandardDumpFile("20140210", "wikidatawiki", wrf, dm, DumpContentType.CURRENT); dump.getDumpFileReader(); diff --git a/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/DumpProcessingExample.java b/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/DumpProcessingExample.java index c7530a0f8..40e3f78ec 100644 --- 
a/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/DumpProcessingExample.java +++ b/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/DumpProcessingExample.java @@ -31,8 +31,8 @@ import org.wikidata.wdtk.datamodel.interfaces.PropertyDocument; import org.wikidata.wdtk.datamodel.interfaces.StatementGroup; import org.wikidata.wdtk.dumpfiles.MwDumpFileProcessor; -import org.wikidata.wdtk.dumpfiles.MwDumpFileProcessorImpl; import org.wikidata.wdtk.dumpfiles.MwRevision; +import org.wikidata.wdtk.dumpfiles.MwRevisionDumpFileProcessor; import org.wikidata.wdtk.dumpfiles.MwRevisionProcessor; import org.wikidata.wdtk.dumpfiles.MwRevisionProcessorBroker; import org.wikidata.wdtk.dumpfiles.StatisticsMwRevisionProcessor; @@ -69,7 +69,7 @@ public static void main(String[] args) throws IOException { MwDumpFileProcessor dumpFileProcessor = createDumpFileProcessor(); // Start processing (may trigger downloads where needed) - dumpFileManager.processAllRecentDumps(dumpFileProcessor, true); + dumpFileManager.processAllRecentRevisionDumps(dumpFileProcessor, true); } /** @@ -148,7 +148,7 @@ private static MwDumpFileProcessor createDumpFileProcessor() { rpBroker.registerMwRevisionProcessor(rpRevisionStats, null, true); // Object to parse XML dumps to send page revisions to our broker: - return new MwDumpFileProcessorImpl(rpBroker); + return new MwRevisionDumpFileProcessor(rpBroker); } /** diff --git a/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/SitelinksExample.java b/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/SitelinksExample.java new file mode 100644 index 000000000..fd06f9a83 --- /dev/null +++ b/wdtk-examples/src/main/java/org/wikidata/wdtk/examples/SitelinksExample.java @@ -0,0 +1,191 @@ +package org.wikidata.wdtk.examples; + +/* + * #%L + * Wikidata Toolkit Examples + * %% + * Copyright (C) 2014 Wikidata Toolkit Developers + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import java.io.IOException; + +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; +import org.wikidata.wdtk.datamodel.interfaces.SiteLink; +import org.wikidata.wdtk.datamodel.interfaces.Sites; +import org.wikidata.wdtk.dumpfiles.DumpContentType; +import org.wikidata.wdtk.dumpfiles.MwDumpFile; +import org.wikidata.wdtk.dumpfiles.MwSitesDumpFileProcessor; +import org.wikidata.wdtk.dumpfiles.WmfDumpFileManager; +import org.wikidata.wdtk.util.DirectoryManager; +import org.wikidata.wdtk.util.DirectoryManagerImpl; +import org.wikidata.wdtk.util.WebResourceFetcher; +import org.wikidata.wdtk.util.WebResourceFetcherImpl; + +/** + * This class demonstrates how to get access to information about sitelinks in + * Wikidata.org. The data generally uses keys like "enwiki" to identify sites. + * To find out what these keys mean, Wikidata Toolkit can be used to download + * and process the dump of the MediaWiki sites table. 
The resulting + * {@link Sites} object can be used to resolve links to other sites, and also + * can be applied to {@link SiteLink} objects as found in the Wikidata data. + * Other information obtained from the sites table includes the site language, + * whether it is a MediaWiki site, and which group it has been assigned to. The + * groups are used to define which sites can be used for entering site links in + * Wikibase, but the sites table does not tell us which groups are currently + * enabled for site links. + * + * @author Markus Kroetzsch + * + */ +public class SitelinksExample { + + public static void main(String[] args) throws IOException { + + // Define where log messages go + configureLogging(); + + // Print information about this program + printDocumentation(); + + // Create object to get hold of Wikidata.org dumpfiles + WmfDumpFileManager dumpFileManager = createDumpFileManager(); + + // Download the sites table dump and extract information + Sites sites = getSitesInformation(dumpFileManager); + + // Access the data to find some information + System.out + .println("********************************************************************"); + System.out.println("*** Completed processing of sites table."); + System.out.println("*** Examples:"); + System.out + .println("*** URL of the page \"Dresden\" on German Wikipedia: " + + sites.getPageUrl("dewiki", "Dresden")); + System.out + .println("*** URL of the page \"ڈگلس ایڈم\" on Urdu Wikipedia: " + + sites.getPageUrl("urwiki", "ڈگلس ایڈم")); + System.out + .println("*** URL of the page \"Special:EntityData/Q1.json\" on Wikidata: " + + sites.getPageUrl("wikidatawiki", + "Special:EntityData/Q1.json")); + System.out + .println("*** Main language of the site identified by \"frwikiquote\": " + + sites.getLanguageCode("frwikiquote")); + System.out + .println("*** Group of the site identified by \"zhwikivoyage\": " + + sites.getGroup("zhwikivoyage")); + System.out + .println("*** URL of the file \"api.php\" on English Wikipedia: " + + sites.getFileUrl("enwiki", "api.php")); + } + + /** + * Creates an object that manages dumpfiles published by the Wikimedia + * Foundation. This object will check for available complete and incremental + * dump files, both online and in a local download directory. It provides + * direct access to the (decompressed) string content of these files. + *
+ * The details in this method define which download directory is to be used, + * which Wikimedia project we are interested in (Wikidata), and that we want + * to allow online access (instead of using local files only). + * + * @return dump file manager + * @throws IOException + * if the download directory is not accessible + */ + private static WmfDumpFileManager createDumpFileManager() + throws IOException { + // The following can also be set to another directory: + String downloadDirectory = System.getProperty("user.dir"); + DirectoryManager downloadDirectoryManager = new DirectoryManagerImpl( + downloadDirectory); + + // The following can be set to null for offline operation: + WebResourceFetcher webResourceFetcher = new WebResourceFetcherImpl(); + + // The string "wikidatawiki" identifies Wikidata.org: + return new WmfDumpFileManager("wikidatawiki", downloadDirectoryManager, + webResourceFetcher); + } + + /** + * Processes the most recent dump of the sites table to extract information + * about registered sites. + * + * @param dumpFileManager + * the dump file manager used to access the dump + * @return a Sites object that contains the extracted information + * @throws IOException + */ + private static Sites getSitesInformation(WmfDumpFileManager dumpFileManager) + throws IOException { + // Get a handle for the most recent dump file of the sites table: + MwDumpFile sitesTableDump = dumpFileManager + .findMostRecentDump(DumpContentType.SITES); + + // Create a suitable processor for such dumps and process the file: + MwSitesDumpFileProcessor sitesDumpFileProcessor = new MwSitesDumpFileProcessor(); + sitesDumpFileProcessor.processDumpFileContents( + sitesTableDump.getDumpFileStream(), sitesTableDump); + + // Return the result: + return sitesDumpFileProcessor.getSites(); + } + + /** + * Defines how messages should be logged. This method can be modified to + * restrict the logging messages that are shown on the console or to change + * their formatting. See the documentation of Log4J for details on how to do + * this. + */ + private static void configureLogging() { + // Create the appender that will write log messages to the console. + ConsoleAppender consoleAppender = new ConsoleAppender(); + // Define the pattern of log messages. + // Insert the string "%c{1}:%L" to also show class name and line. + String pattern = "%d{yyyy-MM-dd HH:mm:ss} %-5p - %m%n"; + consoleAppender.setLayout(new PatternLayout(pattern)); + // Change to Level.ERROR for fewer messages: + consoleAppender.setThreshold(Level.INFO); + + consoleAppender.activateOptions(); + Logger.getRootLogger().addAppender(consoleAppender); + } + + /** + * Prints some basic documentation about this program. + */ + private static void printDocumentation() { + System.out + .println("********************************************************************"); + System.out.println("*** Wikidata Toolkit: Sitelink Processing Example"); + System.out.println("*** "); + System.out + .println("*** This program will download and process site link information from"); + System.out + .println("*** Wikidata. Downloaded files are stored on disk and are used until"); + System.out + .println("*** newer dumps are available. 
You can delete files manually when no"); + System.out + .println("*** longer needed (see message below for the directory where files are found)."); + System.out + .println("********************************************************************"); + } + +} diff --git a/wdtk-testing/src/main/java/org/wikidata/wdtk/testing/MockDirectoryManager.java b/wdtk-testing/src/main/java/org/wikidata/wdtk/testing/MockDirectoryManager.java index 7978dbff6..ed3d66baa 100644 --- a/wdtk-testing/src/main/java/org/wikidata/wdtk/testing/MockDirectoryManager.java +++ b/wdtk-testing/src/main/java/org/wikidata/wdtk/testing/MockDirectoryManager.java @@ -34,6 +34,7 @@ import java.util.HashMap; import java.util.List; +import org.wikidata.wdtk.util.CompressionType; import org.wikidata.wdtk.util.DirectoryManager; /** @@ -166,22 +167,22 @@ public void createFile(String fileName, String fileContents) } @Override - public InputStream getInputStreamForFile(String fileName) - throws IOException { - if (fileName.endsWith(".bz2")) { + public InputStream getInputStreamForFile(String fileName, + CompressionType compressionType) throws IOException { + if (compressionType == CompressionType.GZIP + && !fileName.endsWith(".gz")) { throw new IllegalArgumentException( - "Cannot read bz2 files with this method"); - } - return getInputStreamForMockFile(fileName); - } - - @Override - public InputStream getInputStreamForBz2File(String fileName) - throws IOException { - if (!fileName.endsWith(".bz2")) { + "Can only read gz files with this compression type"); + } else if (compressionType == CompressionType.BZ2 + && !fileName.endsWith(".bz2")) { throw new IllegalArgumentException( - "Can only read bz2 files with this method"); + "Can only read bz2 files with this compression type"); + } else if (compressionType == CompressionType.NONE + && (fileName.endsWith(".bz2") || fileName.endsWith(".gz"))) { + throw new IllegalArgumentException( + "Cannot read compressed files with this compression type"); } + return getInputStreamForMockFile(fileName); } diff --git a/wdtk-testing/src/main/java/org/wikidata/wdtk/testing/MockWebResourceFetcher.java b/wdtk-testing/src/main/java/org/wikidata/wdtk/testing/MockWebResourceFetcher.java index 8cf5886cd..1e44e5b4d 100644 --- a/wdtk-testing/src/main/java/org/wikidata/wdtk/testing/MockWebResourceFetcher.java +++ b/wdtk-testing/src/main/java/org/wikidata/wdtk/testing/MockWebResourceFetcher.java @@ -35,13 +35,7 @@ */ public class MockWebResourceFetcher implements WebResourceFetcher { - public static final String TYPE_HTML = "html"; - public static final String TYPE_GZIP = "gz"; - public static final String TYPE_BZ2 = "bz2"; - public static final String TYPE_ANY = "any"; - final HashMap webResources; - final HashMap webResourceTypes; boolean returnFailingReaders; /** @@ -52,7 +46,6 @@ public class MockWebResourceFetcher implements WebResourceFetcher { */ public MockWebResourceFetcher() { this.webResources = new HashMap(); - this.webResourceTypes = new HashMap(); } /** @@ -69,37 +62,25 @@ public void setReturnFailingReaders(boolean returnFailingReaders) { } /** - * Defines the contents of a new web resource. The contents type is used to - * define which methods are allowed to access this contents. All contents is - * stored as plain text, but contents of type {@link #TYPE_GZIP} can only be - * accessed when using a suitable method, etc. + * Defines the contents of a new web resource. 
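With the content-type bookkeeping gone, mocking a web resource is a single call, and any access method may read it. A minimal JUnit 4 sketch against the simplified API (the test class and method names are invented; the imports assume the wdtk-testing package layout shown in this patch):

import static org.junit.Assert.assertEquals;

import java.io.IOException;

import org.junit.Test;
import org.wikidata.wdtk.testing.MockStringContentFactory;
import org.wikidata.wdtk.testing.MockWebResourceFetcher;

public class SimplifiedMockExampleTest {
	@Test
	public void mockedUrlReturnsContents() throws IOException {
		MockWebResourceFetcher wrf = new MockWebResourceFetcher();
		wrf.setWebResourceContents("http://example.com/a.html", "Line1");
		String content = MockStringContentFactory
				.getStringFromInputStream(wrf
						.getInputStreamForUrl("http://example.com/a.html"));
		assertEquals(content, "Line1");
	}
}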
* * @param url * the URL string * @param contents * the string contents - * @param contentsType - * one of the predefined type constants */ - public void setWebResourceContents(String url, String contents, - String contentsType) { + public void setWebResourceContents(String url, String contents) { this.webResources.put(url, contents); - this.webResourceTypes.put(url, contentsType); } /** * Defines the contents of a new web resource by taking the string from a - * given (Java) resource. The contents type is used to define which methods - * are allowed to access this contents. All contents is stored as plain - * text, but contents of type {@link #TYPE_GZIP} can only be accessed when - * using a suitable method, etc. + * given (Java) resource. * * @param url * the URL string * @param resource * the Java resource name - * @param contentsType - * one of the predefined type constants * @param resourceClass * the Class relative to which the resource should be resolved * (since resources are stored relative to a classpath); can @@ -108,50 +89,33 @@ public void setWebResourceContents(String url, String contents, * if the Java resource could not be loaded */ public void setWebResourceContentsFromResource(String url, String resource, - String contentsType, Class resourceClass) throws IOException { + Class resourceClass) throws IOException { URL resourceUrl = resourceClass.getResource(resource); String contents = MockStringContentFactory .getStringFromUrl(resourceUrl); - setWebResourceContents(url, contents, contentsType); + setWebResourceContents(url, contents); } @Override public InputStream getInputStreamForUrl(String urlString) throws IOException { - return getInputStreamForMockWebResource(urlString, - MockWebResourceFetcher.TYPE_ANY); - } - - @Override - public InputStream getInputStreamForGzipUrl(String urlString) - throws IOException { - return getInputStreamForMockWebResource(urlString, - MockWebResourceFetcher.TYPE_GZIP); + return getInputStreamForMockWebResource(urlString); } /** * Returns an input stream for the content mocked for given URL. It is - * checked that the URL is valid and that the type of its content matches - * the given one. + * checked that the URL is valid. 
* * @param urlString - * @param resourceType - * expected type * @return input stream for resource * @throws IOException */ - InputStream getInputStreamForMockWebResource(String urlString, - String resourceType) throws IOException { + InputStream getInputStreamForMockWebResource(String urlString) + throws IOException { if (!this.webResources.containsKey(urlString)) { throw new IOException("Inaccessible URL (not mocked)"); } - if (!resourceType.equals(MockWebResourceFetcher.TYPE_ANY) - && !resourceType.equals(this.webResourceTypes.get(urlString))) { - throw new IllegalArgumentException( - "Can only access content of type " + resourceType - + " but was " - + this.webResourceTypes.get(urlString) + "."); - } + if (this.returnFailingReaders) { return MockStringContentFactory.getFailingInputStream(); } else { diff --git a/wdtk-testing/src/test/java/org/wikidata/wdtk/testing/MockDirectoryManagerTest.java b/wdtk-testing/src/test/java/org/wikidata/wdtk/testing/MockDirectoryManagerTest.java index c55ffe548..69f36f752 100644 --- a/wdtk-testing/src/test/java/org/wikidata/wdtk/testing/MockDirectoryManagerTest.java +++ b/wdtk-testing/src/test/java/org/wikidata/wdtk/testing/MockDirectoryManagerTest.java @@ -34,6 +34,7 @@ import org.junit.Before; import org.junit.Test; +import org.wikidata.wdtk.util.CompressionType; import org.wikidata.wdtk.util.DirectoryManager; public class MockDirectoryManagerTest { @@ -50,6 +51,9 @@ public void setUp() throws Exception { mdm.setFileContents( basePath.resolve("anotherdir").resolve("test.txt.bz2"), "Test BZ2 contents\nMore contents"); + mdm.setFileContents( + basePath.resolve("anotherdir").resolve("test.txt.gz"), + "Test GZIP contents"); } @Test @@ -93,8 +97,8 @@ public void getSubdirectories() throws IOException { public void readFile() throws IOException { DirectoryManager submdm = mdm.getSubdirectoryManager("dir2"); String content = MockStringContentFactory - .getStringFromInputStream(submdm - .getInputStreamForFile("test.txt")); + .getStringFromInputStream(submdm.getInputStreamForFile( + "test.txt", CompressionType.NONE)); assertEquals(content, "Test contents"); } @@ -102,18 +106,27 @@ public void readFile() throws IOException { public void readBz2File() throws IOException { DirectoryManager submdm = mdm.getSubdirectoryManager("anotherdir"); String content = MockStringContentFactory - .getStringFromInputStream(submdm - .getInputStreamForBz2File("test.txt.bz2")); + .getStringFromInputStream(submdm.getInputStreamForFile( + "test.txt.bz2", CompressionType.BZ2)); assertEquals(content, "Test BZ2 contents\nMore contents"); } + @Test + public void readGzipFile() throws IOException { + DirectoryManager submdm = mdm.getSubdirectoryManager("anotherdir"); + String content = MockStringContentFactory + .getStringFromInputStream(submdm.getInputStreamForFile( + "test.txt.gz", CompressionType.GZIP)); + assertEquals(content, "Test GZIP contents"); + } + @Test public void createFileFromInputStream() throws IOException { InputStream inputStream = MockStringContentFactory .newMockInputStream("New stream contents\nMultiple lines"); mdm.createFile("newfile.txt", inputStream); String content = MockStringContentFactory.getStringFromInputStream(mdm - .getInputStreamForFile("newfile.txt")); + .getInputStreamForFile("newfile.txt", CompressionType.NONE)); assertEquals(content, "New stream contents\nMultiple lines"); } @@ -121,7 +134,7 @@ public void createFileFromInputStream() throws IOException { public void createFileFromString() throws IOException { mdm.createFile("newfile.txt", "New 
contents"); String content = MockStringContentFactory.getStringFromInputStream(mdm - .getInputStreamForFile("newfile.txt")); + .getInputStreamForFile("newfile.txt", CompressionType.NONE)); assertEquals(content, "New contents"); } @@ -129,7 +142,8 @@ public void createFileFromString() throws IOException { public void readFileFails() throws IOException { mdm.setReturnFailingReaders(true); DirectoryManager submdm = mdm.getSubdirectoryManager("dir2"); - InputStream in = submdm.getInputStreamForFile("test.txt"); + InputStream in = submdm.getInputStreamForFile("test.txt", + CompressionType.NONE); // We do not use @Test(expected = IOException.class) in order to check // if the exception is really thrown at the right moment. boolean exception = false; @@ -149,18 +163,30 @@ public void createFileConflict() throws IOException { @Test(expected = FileNotFoundException.class) public void fileNotFound() throws IOException { - mdm.getInputStreamForFile("test.txt"); + mdm.getInputStreamForFile("test.txt", CompressionType.NONE); } @Test(expected = IllegalArgumentException.class) - public void readOnlyNonBz2Files() throws IOException { + public void bunzipOnlyBz2Files() throws IOException { DirectoryManager submdm = mdm.getSubdirectoryManager("dir2"); - submdm.getInputStreamForBz2File("test.txt"); + submdm.getInputStreamForFile("test.txt", CompressionType.BZ2); } @Test(expected = IllegalArgumentException.class) - public void bunzipOnlyBz2Files() throws IOException { + public void gunzipOnlyGzipFiles() throws IOException { + DirectoryManager submdm = mdm.getSubdirectoryManager("dir2"); + submdm.getInputStreamForFile("test.txt", CompressionType.GZIP); + } + + @Test(expected = IllegalArgumentException.class) + public void readOnlyNonBz2Files() throws IOException { + DirectoryManager submdm = mdm.getSubdirectoryManager("anotherdir"); + submdm.getInputStreamForFile("test.txt.bz2", CompressionType.NONE); + } + + @Test(expected = IllegalArgumentException.class) + public void readOnlyNonGzipFiles() throws IOException { DirectoryManager submdm = mdm.getSubdirectoryManager("anotherdir"); - submdm.getInputStreamForFile("test.txt.bz2"); + submdm.getInputStreamForFile("test.txt.gz", CompressionType.NONE); } } diff --git a/wdtk-testing/src/test/java/org/wikidata/wdtk/testing/MockWebResourceFetcherTest.java b/wdtk-testing/src/test/java/org/wikidata/wdtk/testing/MockWebResourceFetcherTest.java index c794e6442..c3a4900eb 100644 --- a/wdtk-testing/src/test/java/org/wikidata/wdtk/testing/MockWebResourceFetcherTest.java +++ b/wdtk-testing/src/test/java/org/wikidata/wdtk/testing/MockWebResourceFetcherTest.java @@ -37,9 +37,7 @@ public class MockWebResourceFetcherTest { public void setUp() throws Exception { mwrf = new MockWebResourceFetcher(); mwrf.setWebResourceContents("http://example.com/test.html", - "Line1\nLine2", MockWebResourceFetcher.TYPE_HTML); - mwrf.setWebResourceContents("http://example.com/test.gzip", - "Line1\nLine2", MockWebResourceFetcher.TYPE_GZIP); + "Line1\nLine2"); } @Test @@ -49,18 +47,11 @@ public void inputStreamForHtml() throws IOException { assertEquals(content, "Line1\nLine2"); } - @Test - public void inputStreamForGzip() throws IOException { - String content = MockStringContentFactory.getStringFromInputStream(mwrf - .getInputStreamForGzipUrl("http://example.com/test.gzip")); - assertEquals(content, "Line1\nLine2"); - } - @Test public void setConcentsFromResource() throws IOException { mwrf.setWebResourceContentsFromResource( "http://example.com/resource.html", "/test.txt", - 
diff --git a/wdtk-util/src/main/java/org/wikidata/wdtk/util/CompressionType.java b/wdtk-util/src/main/java/org/wikidata/wdtk/util/CompressionType.java
new file mode 100644
index 000000000..c6a3a87cf
--- /dev/null
+++ b/wdtk-util/src/main/java/org/wikidata/wdtk/util/CompressionType.java
@@ -0,0 +1,32 @@
+package org.wikidata.wdtk.util;
+
+/*
+ * #%L
+ * Wikidata Toolkit Utilities
+ * %%
+ * Copyright (C) 2014 Wikidata Toolkit Developers
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+/**
+ * Enum for denoting several basic file types for which we provide transparent
+ * decompression.
+ *
+ * @author Markus Kroetzsch
+ *
+ */
+public enum CompressionType {
+	NONE, GZIP, BZ2
+}
diff --git a/wdtk-util/src/main/java/org/wikidata/wdtk/util/DirectoryManager.java b/wdtk-util/src/main/java/org/wikidata/wdtk/util/DirectoryManager.java
index 6cecb9ea1..d87fccd97 100644
--- a/wdtk-util/src/main/java/org/wikidata/wdtk/util/DirectoryManager.java
+++ b/wdtk-util/src/main/java/org/wikidata/wdtk/util/DirectoryManager.java
@@ -95,30 +95,22 @@ long createFile(String fileName, InputStream inputStream)
 	void createFile(String fileName, String fileContents) throws IOException;
 
 	/**
-	 * Returns an input stream to access the file of the given name within the
-	 * current directory.
+	 * Returns an input stream to access the file of the given name within the
+	 * current directory, possibly uncompressing it if required.
 	 *
 	 * It is important to close the stream after using it to free memory.
 	 *
 	 * @param fileName
 	 *            the name of the file
+	 * @param compressionType
+	 *            for types other than {@link CompressionType#NONE}, the file
+	 *            will be uncompressed appropriately and the returned input
+	 *            stream will provide access to the uncompressed content
 	 * @return an InputStream to fetch data from the file
 	 * @throws IOException
 	 */
-	InputStream getInputStreamForFile(String fileName) throws IOException;
-
-	/**
-	 * Returns an input stream to access the BZIP2-compressed file of the given
-	 * name within the current directory.
-	 *
-	 * It is important to close the stream after using it to free memory.
-	 *
-	 * @param fileName
-	 *            the name of the file
-	 * @return an InputStream to fetch data from the file
-	 * @throws IOException
-	 */
-	InputStream getInputStreamForBz2File(String fileName) throws IOException;
+	InputStream getInputStreamForFile(String fileName,
+			CompressionType compressionType) throws IOException;
 
 	/**
 	 * Returns a list of the names of all subdirectories of the base directory.
diff --git a/wdtk-util/src/main/java/org/wikidata/wdtk/util/DirectoryManagerImpl.java b/wdtk-util/src/main/java/org/wikidata/wdtk/util/DirectoryManagerImpl.java
index 91bdeecf5..81beaf5a7 100644
--- a/wdtk-util/src/main/java/org/wikidata/wdtk/util/DirectoryManagerImpl.java
+++ b/wdtk-util/src/main/java/org/wikidata/wdtk/util/DirectoryManagerImpl.java
@@ -36,6 +36,7 @@
 import java.nio.file.StandardOpenOption;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.zip.GZIPInputStream;
 
 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 
@@ -130,18 +131,24 @@ public void createFile(String fileName, String fileContents)
 	}
 
 	@Override
-	public InputStream getInputStreamForFile(String fileName)
-			throws IOException {
+	public InputStream getInputStreamForFile(String fileName,
+			CompressionType compressionType) throws IOException {
 		Path filePath = this.directory.resolve(fileName);
-		return Files.newInputStream(filePath, StandardOpenOption.READ);
-	}
+		InputStream fileInputStream = Files.newInputStream(filePath,
+				StandardOpenOption.READ);
+		switch (compressionType) {
+		case NONE:
+			return fileInputStream;
+		case GZIP:
+			return new GZIPInputStream(fileInputStream);
+		case BZ2:
+			return new BZip2CompressorInputStream(new BufferedInputStream(
+					fileInputStream));
+		default:
+			throw new IllegalArgumentException("Unsupported compression type: "
+					+ compressionType);
+		}
 
-	@Override
-	public InputStream getInputStreamForBz2File(String fileName)
-			throws IOException {
-		Path filePath = this.directory.resolve(fileName);
-		return new BZip2CompressorInputStream(new BufferedInputStream(
-				Files.newInputStream(filePath, StandardOpenOption.READ)));
 	}
 
 	@Override
diff --git a/wdtk-util/src/main/java/org/wikidata/wdtk/util/WebResourceFetcher.java b/wdtk-util/src/main/java/org/wikidata/wdtk/util/WebResourceFetcher.java
index 26edfc151..98eeb3fd1 100644
--- a/wdtk-util/src/main/java/org/wikidata/wdtk/util/WebResourceFetcher.java
+++ b/wdtk-util/src/main/java/org/wikidata/wdtk/util/WebResourceFetcher.java
@@ -32,37 +32,6 @@
  */
 public interface WebResourceFetcher {
 
-	/**
-	 * Returns a BufferedReader for the document at the given URL. The reader
-	 * should be closed after use. The bytes found at the given URL will be
-	 * interpreted as UTF-8 for this operation.
-	 *
-	 * @param urlString
-	 *            the URL of the document
-	 * @return BufferedReader for the requested document
-	 * @throws IOException
-	 *             if the document at the URL could not be opended or the URL
-	 *             was invalid
-	 */
-	// BufferedReader getBufferedReaderForUrl(String urlString) throws
-	// IOException;
-
-	/**
-	 * Returns a BufferedReader for the Gzip-compressed document at the given
-	 * URL. The reader should be closed after use. The bytes found in the
-	 * gzipped file at the given URL will be interpreted as UTF-8 for this
-	 * operation.
-	 *
-	 * @param urlString
-	 *            the URL of the gzipped document
-	 * @return BufferedReader for the requested document
-	 * @throws IOException
-	 *             if the document at the URL could not be opended or the URL
-	 *             was invalid
-	 */
-	// BufferedReader getBufferedReaderForGzipUrl(String urlString)
-	// throws IOException;
-
 	/**
 	 * Returns an InputStream for the document at the given URL. This can be
 	 * used for downloading. The stream should be closed after use.
@@ -76,17 +45,4 @@ public interface WebResourceFetcher {
 	 */
 	InputStream getInputStreamForUrl(String urlString) throws IOException;
 
-	/**
-	 * Returns an InputStream for the Gzip-compressed document at the given URL.
-	 * This can be used for downloading. The stream should be closed after use.
-	 *
-	 * @param urlString
-	 *            the URL of the document
-	 * @return InputStream for the requested document
-	 * @throws IOException
-	 *             if the document at the URL could not be opened or the URL was
-	 *             invalid
-	 */
-	InputStream getInputStreamForGzipUrl(String urlString) throws IOException;
-
 }
\ No newline at end of file
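For illustration: after this change the per-format read methods of DirectoryManager collapse into one method that takes the compression type explicitly. A sketch of the new call pattern, assuming some DirectoryManager instance dm and a made-up file name (both placeholders, not part of the patch):

	// GZIP and BZ2 streams are uncompressed transparently; NONE reads raw bytes.
	try (InputStream in = dm.getInputStreamForFile("dump.json.bz2",
			CompressionType.BZ2)) {
		// read uncompressed content from in; try-with-resources closes it
	}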
diff --git a/wdtk-util/src/main/java/org/wikidata/wdtk/util/WebResourceFetcherImpl.java b/wdtk-util/src/main/java/org/wikidata/wdtk/util/WebResourceFetcherImpl.java
index ec3ec009c..9803f1b48 100644
--- a/wdtk-util/src/main/java/org/wikidata/wdtk/util/WebResourceFetcherImpl.java
+++ b/wdtk-util/src/main/java/org/wikidata/wdtk/util/WebResourceFetcherImpl.java
@@ -23,7 +23,6 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
-import java.util.zip.GZIPInputStream;
 
 /**
  * Standard implementation of {@link WebResourceFetcher}.
@@ -32,22 +31,6 @@
  */
 public class WebResourceFetcherImpl implements WebResourceFetcher {
-
-	// @Override
-	// public BufferedReader getBufferedReaderForUrl(String urlString)
-	// throws IOException {
-	// return new BufferedReader(new InputStreamReader(
-	// this.getInputStreamForUrl(urlString), StandardCharsets.UTF_8));
-	// }
-	//
-	// @Override
-	// public BufferedReader getBufferedReaderForGzipUrl(String urlString)
-	// throws IOException {
-	// return new BufferedReader(new InputStreamReader(
-	// this.getInputStreamForGzipUrl(urlString),
-	// StandardCharsets.UTF_8));
-	// }
-
 	@Override
 	public InputStream getInputStreamForUrl(String urlString)
 			throws IOException {
@@ -55,11 +38,4 @@ public InputStream getInputStreamForUrl(String urlString)
 		return url.openStream();
 	}
 
-	@Override
-	public InputStream getInputStreamForGzipUrl(String urlString)
-			throws IOException {
-		URL url = new URL(urlString);
-		return new GZIPInputStream(url.openStream());
-	}
-
 }
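For illustration: callers that relied on the removed getInputStreamForGzipUrl can wrap the plain stream themselves. A sketch of such a replacement helper; openGzipUrl is hypothetical and assumes imports of java.io.IOException, java.io.InputStream, and java.util.zip.GZIPInputStream:

	static InputStream openGzipUrl(WebResourceFetcher wrf, String urlString)
			throws IOException {
		// Transparent gunzipping, as the removed method used to provide.
		return new GZIPInputStream(wrf.getInputStreamForUrl(urlString));
	}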
diff --git a/wdtk-util/src/test/java/org/wikidata/wdtk/util/TimerTest.java b/wdtk-util/src/test/java/org/wikidata/wdtk/util/TimerTest.java
index cc9edffcc..54d2b6e19 100644
--- a/wdtk-util/src/test/java/org/wikidata/wdtk/util/TimerTest.java
+++ b/wdtk-util/src/test/java/org/wikidata/wdtk/util/TimerTest.java
@@ -35,6 +35,13 @@
 
 public class TimerTest {
 
+	/**
+	 * Base value for the time in nanoseconds that we allow between our
+	 * measured times and what the timer returns. In theory, no tolerance
+	 * should be needed, but in practice a sufficiently high value avoids
+	 * spurious failures.
+	 */
+	static final int TIME_TOLERANCE = 200000;
+
 	/**
 	 * Spend some time computing to be able to measure something.
 	 *
@@ -86,12 +93,14 @@ public void basicTimerOperation() {
 		assertTrue(
 				"Unrealistic CPU time: " + timer.getTotalCpuTime()
 						+ " should be closer to " + cpuTime1,
-				(cpuTime1 - 100000) <= timer.getTotalCpuTime()
+				(cpuTime1 - TimerTest.TIME_TOLERANCE) <= timer
+						.getTotalCpuTime()
 						&& timer.getTotalCpuTime() <= cpuTime1);
 		assertTrue(
 				"Unrealistic wall time: " + timer.getTotalWallTime()
 						+ " should be closer to " + wallTime1,
-				(wallTime1 - 200000) <= timer.getTotalWallTime()
+				(wallTime1 - 2 * TimerTest.TIME_TOLERANCE) <= timer
+						.getTotalWallTime()
 						&& timer.getTotalWallTime() <= wallTime1);
 
 		long cpuTime2 = tmxb.getThreadCpuTime(threadId);
@@ -104,12 +113,14 @@ public void basicTimerOperation() {
 		assertTrue(
 				"Unrealistic total CPU time: " + timer.getTotalCpuTime()
 						+ " should be closer to " + cpuTime1,
-				(cpuTime1 - 200000) <= timer.getTotalCpuTime()
+				(cpuTime1 - 2 * TimerTest.TIME_TOLERANCE) <= timer
+						.getTotalCpuTime()
 						&& timer.getTotalCpuTime() <= cpuTime1);
 		assertTrue(
 				"Unrealistic total wall time: " + timer.getTotalWallTime()
 						+ " should be closer to " + wallTime1,
-				(wallTime1 - 400000) <= timer.getTotalWallTime()
+				(wallTime1 - 4 * TimerTest.TIME_TOLERANCE) <= timer
+						.getTotalWallTime()
 						&& timer.getTotalWallTime() <= wallTime1);
 
 		assertEquals(timer.getTotalCpuTime() / 2, timer.getAvgCpuTime());
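For illustration: each of these assertions accepts a timer value that lies at most some multiple of TIME_TOLERANCE below the externally measured reference, and never above it. The same pattern as a hypothetical helper; assertCloseBelow is not part of the patch, and all times are in nanoseconds:

	// Passes when actual lies in [expected - slack, expected].
	static void assertCloseBelow(String message, long expected, long actual,
			long slack) {
		assertTrue(message, (expected - slack) <= actual && actual <= expected);
	}
	// e.g.: assertCloseBelow("Unrealistic CPU time", cpuTime1,
	//		timer.getTotalCpuTime(), 2 * TimerTest.TIME_TOLERANCE);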