Skip to content

Commit

Permalink
Fixed Catalogue Federation through DCAT Dump in TURTLE format.
Browse files Browse the repository at this point in the history
  • Loading branch information
emastrosimone committed Sep 5, 2018
1 parent 94e9ab4 commit 692dc3c
Show file tree
Hide file tree
Showing 8 changed files with 78 additions and 37 deletions.
12 changes: 12 additions & 0 deletions Idra/src/main/java/it/eng/idra/beans/odms/ODMSCatalogue.java
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,10 @@ public class ODMSCatalogue {
// @Transient
private DCATAPProfile dcatProfile;

@Transient
private DCATAPFormat dcatFormat;


public ODMSCatalogue() {
this.setSynchLock(ODMSSynchLock.NONE);
this.location = "";
Expand Down Expand Up @@ -499,6 +503,14 @@ public void setDCATProfile(DCATAPProfile profile) {
this.dcatProfile = profile;
}

/**
 * Returns the DCAT-AP serialization format currently recorded for this
 * catalogue.
 *
 * @return the value of the {@code dcatFormat} field; may be {@code null} if
 *         it has not been set
 */
public DCATAPFormat getDcatFormat() {
return dcatFormat;
}

/**
 * Records the DCAT-AP serialization format for this catalogue.
 *
 * @param dcatFormat the format to store in the {@code dcatFormat} field
 */
public void setDcatFormat(DCATAPFormat dcatFormat) {
this.dcatFormat = dcatFormat;
}

/**
 * Returns the country recorded for this catalogue.
 *
 * @return the value of the {@code country} field
 */
public String getCountry() {
return country;
}
Expand Down
38 changes: 24 additions & 14 deletions Idra/src/main/java/it/eng/idra/connectors/DCATDumpConnector.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,31 +22,21 @@
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.riot.RiotException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import it.eng.idra.beans.ODFProperty;
import it.eng.idra.beans.dcat.DCATAPFormat;
import it.eng.idra.beans.dcat.DCATAPProfile;
import it.eng.idra.beans.dcat.DCATDataset;
import it.eng.idra.beans.odms.ODMSCatalogue;
import it.eng.idra.beans.odms.ODMSSynchronizationResult;
Expand Down Expand Up @@ -163,22 +153,42 @@ private List<DCATDataset> getDatasetsFromDumpString(String dumpString) throws Ex
List<DCATDataset> datasetsList = new ArrayList<DCATDataset>();

// Pass the Node Host as base URI for the model
Model m = deserializer.dumpToModel(dumpString, node.getHost());

Matcher matcher = deserializer.getDatasetPattern().matcher(dumpString);
Model m = deserializer.dumpToModel(dumpString, node);
Matcher matcher = deserializer.getDatasetPattern(node.getDcatFormat()).matcher(dumpString);
String datasetURI = null;
int hits = 0;
while (matcher.find()) {

datasetURI = null;
try {
if ((datasetURI = matcher.group(2)) != null) {

switch (node.getDcatFormat()) {

case TURTLE:
datasetURI = matcher.group(1);
break;

// RDFXML is the default
default:
datasetURI = matcher.group(1);
break;

}

if (StringUtils.isNotBlank(datasetURI)) {

Resource r = m.getResource(datasetURI);
datasetsList.add(deserializer.resourceToDataset(nodeID, r));
}

} catch (Exception e) {
logger.info("Skipped dataset - There was an error: " + e.getMessage() + " while deserializing dataset: "
+ datasetURI);
System.out.println(hits++);
}
}


if (datasetsList.size() != 0) {
DCATAPSerializer.writeModelToFile(m, DCATAPFormat.RDFXML, odmsDumpFilePath, "dumpFileString_" + nodeID);
node.setDumpFilePath(odmsDumpFilePath + "dumpFileString_" + nodeID);
Expand Down
41 changes: 30 additions & 11 deletions Idra/src/main/java/it/eng/idra/dcat/dump/DCATAPDeserializer.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Property;
import org.apache.jena.rdf.model.RDFNode;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.rdf.model.ResourceFactory;
import org.apache.jena.rdf.model.ResourceRequiredException;
Expand Down Expand Up @@ -63,27 +64,34 @@
import it.eng.idra.beans.dcat.SKOSPrefLabel;
import it.eng.idra.beans.dcat.SPDXChecksum;
import it.eng.idra.beans.dcat.VCardOrganization;
import it.eng.idra.beans.odms.ODMSCatalogue;
import it.eng.idra.utils.CommonUtil;

public class DCATAPDeserializer implements IDCATAPDeserialize {

private static final Pattern datasetPattern = Pattern.compile("\\w*<dcat:Dataset rdf:about=\\\"(.*)\\\"");
private static final Pattern rdfDatasetPattern = Pattern.compile("\\w*<dcat:Dataset rdf:about=\\\"(.*)\\\"");
private static final Pattern turtleDatasetPattern = Pattern.compile("<(.*)>\\R\\s*a dcat:Dataset");

private static final String GEO_BASE_URI = "http://publications.europa.eu/mdr/authority/place";
private static final String GEO_BASE_URI_ALT = "http://www.geonames.org";

public DCATAPDeserializer() {
}

public Model dumpToModel(String modelText, String nodeBaseURI) throws RiotException {
public Model dumpToModel(String modelText, ODMSCatalogue node) throws RiotException {

String nodeBaseURI = node.getHost();
// create an empty model
Model model = ModelFactory.createDefaultModel();
for (DCATAPFormat format : DCATAPFormat.values()) {
try {
model.read(new ByteArrayInputStream(modelText.getBytes(StandardCharsets.UTF_8)), nodeBaseURI,
format.formatName());
node.setDcatFormat(format);
break;
} catch (RiotException e) {
if (!e.getMessage().contains("Content is not allowed in prolog"))
if (!e.getMessage().contains("Content is not allowed in prolog") && !e.getMessage()
.contains("[line: 1, col: 1 ] Expected BNode or IRI: Got: [DIRECTIVE:prefix]"))
throw e;
else
continue;
Expand Down Expand Up @@ -201,7 +209,12 @@ public DCATDataset resourceToDataset(String nodeID, Resource datasetResource)
// Iterate over source properties
StmtIterator sourceIt = datasetResource.listProperties(DCTerms.source);
while (sourceIt.hasNext()) {
source.add(sourceIt.next().getString());
Statement sourceStm = sourceIt.next();
try {
source.add(sourceStm.getString());
} catch (LiteralRequiredException e) {
source.add(sourceStm.getResource().getURI());
}
}

// Handle spatial property
Expand Down Expand Up @@ -562,10 +575,9 @@ public FOAFAgent deserializeFOAFAgent(String nodeID, Statement agentStatement) {

String agentIdentifier = null, agentUri = null, agentName = null, agentMbox = null, agentHomepage = null,
agentType = null;
Resource agentResource = null;

Resource agentResource = agentStatement.getResource();

if (agentResource != null) {
if (agentStatement != null && (agentResource = agentStatement.getResource()) != null) {

agentUri = agentResource.getURI();
if (agentResource.hasProperty(FOAF.name))
Expand Down Expand Up @@ -682,9 +694,9 @@ public DCATDistribution resourceToDCATDistribution(Resource r, String nodeID) {
mediaType = r.getProperty(DCAT.mediaType).getString();

if (r.hasProperty(DCTerms.issued))
releaseDate = CommonUtil.fixBadUTCDate(r.getProperty(DCTerms.issued).getString());
releaseDate = extractDate(r.getProperty(DCTerms.issued));
if (r.hasProperty(DCTerms.modified))
updateDate = CommonUtil.fixBadUTCDate(r.getProperty(DCTerms.modified).getString());
updateDate = extractDate(r.getProperty(DCTerms.modified));
if (r.hasProperty(DCTerms.rights))
rights = r.getProperty(DCTerms.rights).getString();

Expand Down Expand Up @@ -781,8 +793,15 @@ public String extractLanguageFromURI(String uri) {

}

public Pattern getDatasetPattern() {
return datasetPattern;
/**
 * Returns the regular expression used to locate dataset URIs inside a raw
 * dump serialized in the given format.
 *
 * @param format the serialization format of the dump; may be {@code null},
 *               in which case the RDF/XML pattern is returned
 * @return the Turtle pattern for {@code TURTLE}, the RDF/XML pattern for
 *         every other value
 */
public Pattern getDatasetPattern(DCATAPFormat format) {

    // Switching on a null enum reference throws NullPointerException, so
    // fall back to the RDF/XML pattern, which is already the default branch.
    if (format == null) {
        return rdfDatasetPattern;
    }

    switch (format) {

    case TURTLE:
        return turtleDatasetPattern;
    default:
        return rdfDatasetPattern;
    }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -263,9 +263,9 @@ public DCTPeriodOfTime deserializeTemporalCoverage(String nodeID, Resource datas
return null;
}

@Override
public Pattern getDatasetPattern() {
return datasetPattern;
}
// @Override
// public Pattern getDatasetPattern() {
// return datasetPattern;
// }

}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.apache.jena.rdf.model.Statement;
import org.apache.jena.riot.RiotException;

import it.eng.idra.beans.dcat.DCATAPFormat;
import it.eng.idra.beans.dcat.DCATAPProfileNotValidException;
import it.eng.idra.beans.dcat.DCATDataset;
import it.eng.idra.beans.dcat.DCATDistribution;
Expand All @@ -36,10 +37,11 @@
import it.eng.idra.beans.dcat.SKOSConcept;
import it.eng.idra.beans.dcat.SPDXChecksum;
import it.eng.idra.beans.dcat.VCardOrganization;
import it.eng.idra.beans.odms.ODMSCatalogue;

public interface IDCATAPDeserialize {

public Model dumpToModel(String modelText, String baseURI) throws RiotException;
public Model dumpToModel(String modelText, ODMSCatalogue node) throws RiotException;

public DCATDataset resourceToDataset(String nodeID, Resource datasetResource) throws DCATAPProfileNotValidException;

Expand Down Expand Up @@ -73,6 +75,6 @@ public interface IDCATAPDeserialize {

String extractLanguageFromURI(String uri);

Pattern getDatasetPattern();
Pattern getDatasetPattern(DCATAPFormat format);

}
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
import it.eng.idra.beans.odms.ODMSSynchLock;
import it.eng.idra.cache.CachePersistenceManager;
import it.eng.idra.cache.MetadataCacheManager;
import it.eng.idra.dcat.dump.DCATAPDumpManager;
import it.eng.idra.odfscheduler.ODFScheduler;
import it.eng.idra.odfscheduler.SchedulerNotInitialisedException;
import it.eng.idra.search.EuroVocTranslator;
Expand Down
5 changes: 2 additions & 3 deletions Idra/src/main/java/it/eng/idra/management/ODMSManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -368,16 +368,15 @@ public static int addODMSCatalogue(ODMSCatalogue node) throws ODMSAlreadyPresent
switch (node.getDCATProfile()) {

case DCATAP_IT:
m = new DCATAPITDeserializer().dumpToModel(node.getDumpString(), node.getHost());
m = new DCATAPITDeserializer().dumpToModel(node.getDumpString(), node);
break;
default:
// If no profile was provided, instantiate a base DCATAP Deserializer
m = new DCATAPDeserializer().dumpToModel(node.getDumpString(), node.getHost());
m = new DCATAPDeserializer().dumpToModel(node.getDumpString(), node);
break;

}

// Model m = DCATAPDeserializer.dumpToModel(node.getDumpString());
String odmsDumpFilePath = PropertyManager.getProperty(ODFProperty.ODMS_DUMP_FILE_PATH);
try {
DCATAPSerializer.writeModelToFile(m, DCATAPFormat.RDFXML, odmsDumpFilePath,
Expand Down
4 changes: 2 additions & 2 deletions Idra/src/main/java/it/eng/idra/utils/CommonUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public class CommonUtil {
private static Logger logger = LogManager.getLogger(CommonUtil.class);
private static DateTimeFormatter dtFormatter = DateTimeFormatter.ISO_OFFSET_DATE_TIME.withZone(ZoneOffset.UTC);
private static String[] dateFormats = { "dd/MM/yyyy", "yyyy-MM-dd", "EEE MMM dd HH:mm:ss zzz yyyy",
"EEEE dd MMMM yyyy", "dd MMMM yyyy", "" };
"EEEE dd MMMM yyyy", "dd MMMM yyyy", "yyyy-MM-dd'T'HH:mm:ss[XXX][X]" };

public static Ordering<ODMSCatalogue> idOrder = new Ordering<ODMSCatalogue>() {
public int compare(ODMSCatalogue one, ODMSCatalogue other) {
Expand Down Expand Up @@ -164,7 +164,7 @@ public static String toUtcDate(String dateString) throws IllegalArgumentExceptio
public static String fromLocalToUtcDate(String originalDateString, Locale locale) {

if (StringUtils.isNotBlank(originalDateString)) {
String dateString = originalDateString.toLowerCase();
String dateString = originalDateString.toUpperCase();

if (locale == null) {
Locale[] locales = Locale.getAvailableLocales();
Expand Down

0 comments on commit 692dc3c

Please sign in to comment.