diff --git a/components-core/src/main/java/org/dllearner/algorithms/qtl/QTL2Disjunctive.java b/components-core/src/main/java/org/dllearner/algorithms/qtl/QTL2Disjunctive.java index 662e358ca0..4584f1b5fc 100644 --- a/components-core/src/main/java/org/dllearner/algorithms/qtl/QTL2Disjunctive.java +++ b/components-core/src/main/java/org/dllearner/algorithms/qtl/QTL2Disjunctive.java @@ -460,7 +460,7 @@ private EvaluatedRDFResourceTree computeBestPartialSolution(){ ((LGGGeneratorSimple)lggGenerator).setTimeout(getRemainingPartialSolutionTime(), TimeUnit.SECONDS); RDFResourceTree lgg = lggGenerator.getLGG(currentTree, uncoveredTree); MonitorFactory.getTimeMonitor("lgg").stop(); - System.out.println("COMPLETE:" + ((LGGGeneratorSimple)lggGenerator).isComplete()); +// System.out.println("COMPLETE:" + ((LGGGeneratorSimple)lggGenerator).isComplete()); // logger.info("LGG: " + lgg.getStringRepresentation()); // redundancy check @@ -1071,15 +1071,6 @@ public void setNoise(double noise) { this.noise = noise; } - /** - * @param maxExecutionTimeInSeconds the maximum execution time in seconds until the - * algorithm will terminate gracefully - */ - @Override - public void setMaxExecutionTimeInSeconds(int maxExecutionTimeInSeconds) { - this.maxExecutionTimeInSeconds = maxExecutionTimeInSeconds; - } - /** * Default value is 1. Lower values force importance of covering positive examples. * @param beta the beta to set @@ -1089,7 +1080,10 @@ public void setBeta(double beta) { } /** - * @param maxTreeComputationTimeInSeconds the maxTreeComputationTimeInSeconds to set + * Set the max. execution time for the computation of a partial solution. If this value isn't set, the + * max. algorithm runtime will be used; thus, in the worst case, only one partial solution will be computed. + * + * @param maxTreeComputationTimeInSeconds the max. computation time for a partial solution tree */ public void setMaxTreeComputationTimeInSeconds(double maxTreeComputationTimeInSeconds) { this.maxTreeComputationTimeInSeconds = maxTreeComputationTimeInSeconds; } @@ -1174,7 +1168,7 @@ public void setMaxTreeDepth(int maxTreeDepth) { } /** - * @return the runtime until the best solution was found + * @return the runtime in ms until the best solution was found */ public long getTimeBestSolutionFound() { return timeBestSolutionFound; diff --git a/components-core/src/main/java/org/dllearner/algorithms/qtl/QTL2DisjunctiveMultiThreaded.java b/components-core/src/main/java/org/dllearner/algorithms/qtl/QTL2DisjunctiveMultiThreaded.java new file mode 100644 index 0000000000..ee9cf026e3 --- /dev/null +++ b/components-core/src/main/java/org/dllearner/algorithms/qtl/QTL2DisjunctiveMultiThreaded.java @@ -0,0 +1,1259 @@ +/** + * Copyright (C) 2007 - 2016, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ +package org.dllearner.algorithms.qtl; + +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.google.common.collect.Sets; +import com.jamonapi.MonitorFactory; +import gnu.trove.map.TObjectIntMap; +import gnu.trove.map.hash.TObjectIntHashMap; +import org.aksw.jena_sparql_api.core.QueryExecutionFactory; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.dllearner.algorithms.qtl.datastructures.impl.EvaluatedRDFResourceTree; +import org.dllearner.algorithms.qtl.datastructures.impl.QueryTreeImpl.LiteralNodeConversionStrategy; +import org.dllearner.algorithms.qtl.datastructures.impl.QueryTreeImpl.LiteralNodeSubsumptionStrategy; +import org.dllearner.algorithms.qtl.datastructures.impl.RDFResourceTree; +import org.dllearner.algorithms.qtl.heuristics.QueryTreeHeuristic; +import org.dllearner.algorithms.qtl.heuristics.QueryTreeHeuristicSimple; +import org.dllearner.algorithms.qtl.impl.QueryTreeFactory; +import org.dllearner.algorithms.qtl.impl.QueryTreeFactoryBase; +import org.dllearner.algorithms.qtl.operations.lgg.LGGGenerator; +import org.dllearner.algorithms.qtl.operations.lgg.LGGGeneratorExt; +import org.dllearner.algorithms.qtl.operations.lgg.LGGGeneratorRDFS; +import org.dllearner.algorithms.qtl.operations.lgg.LGGGeneratorSimple; +import org.dllearner.algorithms.qtl.util.Entailment; +import org.dllearner.algorithms.qtl.util.filters.PredicateExistenceFilterDBpedia; +import org.dllearner.core.*; +import org.dllearner.core.StringRenderer.Rendering; +import org.dllearner.core.config.ConfigOption; +import org.dllearner.kb.OWLAPIOntology; +import org.dllearner.kb.OWLFile; +import org.dllearner.kb.SparqlEndpointKS; +import org.dllearner.kb.sparql.ConciseBoundedDescriptionGenerator; +import org.dllearner.kb.sparql.ConciseBoundedDescriptionGeneratorImpl; +import org.dllearner.learningproblems.Heuristics; +import org.dllearner.learningproblems.PosNegLP; +import org.dllearner.learningproblems.QueryTreeScore; +import org.semanticweb.owlapi.model.OWLClassExpression; +import org.semanticweb.owlapi.model.OWLIndividual; +import org.semanticweb.owlapi.util.SimpleShortFormProvider; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.*; +import java.util.Map.Entry; +import java.util.concurrent.*; +import java.util.stream.Collectors; + +/** + * A tree-based algorithm ... 
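+ * In short (summarising the implementation below): query trees are built from the concise bounded descriptions (CBDs) of the positive and negative examples; candidates are then generated by combining trees via their least general generalisation (LGG), scored against the examples with a {@link QueryTreeHeuristic}, and the best-scoring tree is kept as a partial solution. Covered examples are removed and the search is repeated, so the overall solution is the disjunction of the partial solutions. In contrast to {@link QTL2Disjunctive}, the candidate LGGs of each search step are computed and evaluated concurrently on a fixed-size thread pool. + * + * An illustrative usage sketch ({@code lp} and {@code ks} stand for an already configured learning problem and knowledge source; they are assumed here, not defined by this class): + * <pre> + * QTL2DisjunctiveMultiThreaded qtl = new QTL2DisjunctiveMultiThreaded(lp, ks); + * qtl.setMaxExecutionTimeInSeconds(60); // total time budget + * qtl.setMaxTreeComputationTimeInSeconds(10); // time budget per partial solution + * qtl.setNrOfThreads(4); + * qtl.init(); + * qtl.start(); + * EvaluatedDescription best = qtl.getCurrentlyBestEvaluatedDescription(); + * </pre>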
\todo add explanation + */ +@ComponentAnn(name="query tree learner with noise (disjunctive) - multi-threaded", shortName="qtl2dismt", version=0.8) +public class QTL2DisjunctiveMultiThreaded extends AbstractCELA implements Cloneable{ + + private static final Logger logger = LoggerFactory.getLogger(QTL2DisjunctiveMultiThreaded.class); + private final DecimalFormat dFormat = new DecimalFormat("0.00"); + + private SparqlEndpointKS ks; + +// private LGGGenerator2 lggGenerator = new LGGGeneratorSimple(); + private LGGGenerator lggGenerator; + + private QueryTreeFactory treeFactory; + private ConciseBoundedDescriptionGenerator cbdGen; + + private Queue todoList; + private SortedSet currentPartialSolutions = new ConcurrentSkipListSet<>(); + + private double bestCurrentScore = 0d; + private EvaluatedRDFResourceTree bestPartialSolutionTree; + + private List currentPosExampleTrees = new ArrayList<>(); + private List currentNegExampleTrees = new ArrayList<>(); + private Set currentPosExamples = new HashSet<>(); + private Set currentNegExamples = new HashSet<>(); + + private BiMap tree2Individual = HashBiMap.create(); + + private PosNegLP lp; + + private Model model; + + private volatile boolean stop; + private boolean isRunning; + + private List partialSolutions; + + private EvaluatedDescription currentBestSolution; + + private QueryTreeHeuristic heuristic; + + //Parameters + @ConfigOption(defaultValue="0.0", description="the (approximated) percentage of noise within the examples") + private double noisePercentage = 0.0; + + private double coverageWeight = 0.8; + private double specifityWeight = 0.1; + + private double minCoveredPosExamplesFraction = 0.2; + + // maximum execution time to compute a part of the solution + private double maxTreeComputationTimeInSeconds = 10; + + @ConfigOption(defaultValue = "1", description = "how important it is not to cover negatives") + private double beta = 1; + + // minimum score a query tree must have to be part of the solution + private double minimumTreeScore = 0.3; + + // If TRUE the algorithm tries to cover all positive examples. 
Note that + // while this improves accuracy on the testing set, + // it may lead to overfitting + private boolean tryFullCoverage; + + // algorithm will terminate immediately when a correct definition is found + private boolean stopOnFirstDefinition; + + // the (approximated) value of noise within the examples + private double noise = 0.0; + + private long partialSolutionStartTime; + + private double startPosExamplesSize; + private int expressionTests = 0; + + // the time needed until the best solution was found + private long timeBestSolutionFound = -1; + + LiteralNodeConversionStrategy[] strategies = new LiteralNodeConversionStrategy[]{ + LiteralNodeConversionStrategy.MIN, + LiteralNodeConversionStrategy.MAX, + LiteralNodeConversionStrategy.MIN_MAX, + LiteralNodeConversionStrategy.DATATYPE + }; + private QueryExecutionFactory qef; + + private Entailment entailment = Entailment.SIMPLE; + + private int maxTreeDepth = 2; + + private boolean useDisjunction = false; + + private int nrOfThreads = Runtime.getRuntime().availableProcessors(); + + public QTL2DisjunctiveMultiThreaded() {} + + public QTL2DisjunctiveMultiThreaded(PosNegLP learningProblem, AbstractReasonerComponent reasoner) { + super(learningProblem, reasoner); + loadModel(); + } + + public QTL2DisjunctiveMultiThreaded(PosNegLP lp, QueryExecutionFactory qef) { + super.learningProblem = lp; + this.lp = lp; + this.qef = qef; + } + + public QTL2DisjunctiveMultiThreaded(PosNegLP lp, SparqlEndpointKS ks) { + this(lp, ks.getQueryExecutionFactory()); + } + +// public QTL2Disjunctive(PosNegLP lp, Model model) { +// this.learningProblem = lp; +// this.model = model; +// } + + /** + * Copy constructor. + * @param qtl the QTL2Disjunctive instance + */ + public QTL2DisjunctiveMultiThreaded(QTL2DisjunctiveMultiThreaded qtl) { + super(qtl.getLearningProblem(), qtl.getReasoner()); + this.model = ModelFactory.createDefaultModel(); + this.model.add(qtl.model); + this.beta = qtl.beta; + this.maxExecutionTimeInSeconds = qtl.maxExecutionTimeInSeconds; + this.maxTreeComputationTimeInSeconds = qtl.maxTreeComputationTimeInSeconds; + this.tryFullCoverage = qtl.tryFullCoverage; + this.stopOnFirstDefinition = qtl.stopOnFirstDefinition; + } + + /* (non-Javadoc) + * @see org.dllearner.core.Component#init() + */ + @Override + public void init() throws ComponentInitException { + logger.info("Initializing..."); + if(!(learningProblem instanceof PosNegLP)){ + throw new IllegalArgumentException("Only PosNeg learning problems are supported"); + } + lp = (PosNegLP) learningProblem; + + // get query execution factory from KS + if(qef == null) { + qef = ks.getQueryExecutionFactory(); + } + + if(treeFactory == null) { + treeFactory = new QueryTreeFactoryBase(); + } + cbdGen = new ConciseBoundedDescriptionGeneratorImpl(qef); + + // set the used heuristic + if(heuristic == null){ + heuristic = new QueryTreeHeuristicSimple(); + heuristic.setPosExamplesWeight(beta); + } + + if(entailment == Entailment.SIMPLE) { + lggGenerator = new LGGGeneratorSimple(); +// lggGenerator = new LGGGeneratorExt(); +// ((LGGGeneratorExt)lggGenerator).setTreeFilters(Sets.newHashSet(new PredicateExistenceFilterDBpedia(null))); + } else if(entailment == Entailment.RDFS){ + lggGenerator = new LGGGeneratorRDFS(reasoner); + } + + // generate the query trees + generateQueryTrees(); + + startPosExamplesSize = currentPosExampleTrees.size(); + + //console rendering of class expressions + StringRenderer.setRenderer(Rendering.MANCHESTER_SYNTAX); + StringRenderer.setShortFormProvider(new 
SimpleShortFormProvider()); + + //compute the LGG for all examples + //this allows us to prune all other trees because we can omit paths in trees which are contained in all positive + //as well as negative examples +// List allExamplesTrees = new ArrayList(); +// allExamplesTrees.addAll(currentPosExampleTrees); +// allExamplesTrees.addAll(currentNegExampleTrees); +// RDFResourceTree lgg = lggGenerator.getLGG(allExamplesTrees); +// lgg.dump(); + logger.info("...initialization finished."); + } + + /** + * @param entailment the entailment to set + */ + public void setEntailment(Entailment entailment) { + this.entailment = entailment; + } + + public void setNrOfThreads(int nrOfThreads) { + this.nrOfThreads = nrOfThreads; + } + + private void generateQueryTrees(){ + logger.info("Generating trees..."); + RDFResourceTree queryTree; + + // positive examples + if(currentPosExampleTrees.isEmpty()){ + for (OWLIndividual ind : lp.getPositiveExamples()) { + try { + Model cbd = cbdGen.getConciseBoundedDescription(ind.toStringID(), maxTreeDepth); +// cbd.write(new FileOutputStream("/tmp/dbpedia-" + ind.toStringID().substring(ind.toStringID().lastIndexOf('/') + 1) + ".ttl"), "TURTLE", null); + queryTree = treeFactory.getQueryTree(ind.toStringID(), cbd, maxTreeDepth); + tree2Individual.put(queryTree, ind); + currentPosExampleTrees.add(queryTree); + currentPosExamples.add(ind); + logger.debug(ind.toStringID()); + logger.debug(queryTree.getStringRepresentation()); + } catch (Exception e) { + logger.error("Failed to generate tree for resource " + ind, e); + throw new RuntimeException(); + } + } + } + + // negative examples + if(currentNegExampleTrees.isEmpty()){ + for (OWLIndividual ind : lp.getNegativeExamples()) { + try { + Model cbd = cbdGen.getConciseBoundedDescription(ind.toStringID(), maxTreeDepth); + queryTree = treeFactory.getQueryTree(ind.toStringID(), cbd, maxTreeDepth); + tree2Individual.put(queryTree, ind); + currentNegExampleTrees.add(queryTree); + currentNegExamples.add(ind); + logger.debug(ind.toStringID()); + logger.debug(queryTree.getStringRepresentation()); + } catch (Exception e) { + logger.error("Failed to generate tree for resource " + ind, e); + throw new RuntimeException(); + } + } + } + logger.info("...done."); + } + + /* (non-Javadoc) + * @see org.dllearner.core.LearningAlgorithm#start() + */ + @Override + public void start() { + if(currentPosExampleTrees.isEmpty()) { + logger.info("No positive examples given!"); + return; + } + + printSetup(); + + reset(); + logger.info("Running..."); + nanoStartTime = System.nanoTime(); + + // if noise=0 and there are no neg. examples, we can simply compute the LGG for all pos. examples + if(noise == 0 && currentNegExampleTrees.isEmpty()) { + lggGenerator.setTimeout(getRemainingRuntimeInMilliseconds(), TimeUnit.MILLISECONDS); + RDFResourceTree lgg = lggGenerator.getLGG(currentPosExampleTrees); + Set solutions = evaluate(lgg, false); + timeBestSolutionFound = getCurrentRuntimeInMilliSeconds(); + currentPartialSolutions.addAll(solutions); + partialSolutions.addAll(solutions); + currentBestSolution = solutions.iterator().next().asEvaluatedDescription(); + } else { + int i = 1; + while(!terminationCriteriaSatisfied() && (useDisjunction || i == 1)){ + logger.info(i++ + ". iteration..."); + logger.info("#Remaining pos. examples:" + currentPosExampleTrees.size()); + logger.info("#Remaining neg. 
examples:" + currentNegExampleTrees.size()); + + // compute best (partial) solution computed so far + EvaluatedRDFResourceTree bestPartialSolution = computeBestPartialSolution(); + + // add to set of partial solutions if criteria are satisfied + if(bestPartialSolution.getScore() >= minimumTreeScore){ + + partialSolutions.add(bestPartialSolution); + + // remove all examples covered by current partial solution + RDFResourceTree tree; + for (Iterator iterator = currentPosExampleTrees.iterator(); iterator.hasNext();) { + tree = iterator.next(); + if(!bestPartialSolution.getFalseNegatives().contains(tree)){//a pos tree that is not covered + iterator.remove(); + currentPosExamples.remove(tree2Individual.get(tree)); + } + } + for (Iterator iterator = currentNegExampleTrees.iterator(); iterator.hasNext();) { + tree = iterator.next(); + if(bestPartialSolution.getFalsePositives().contains(tree)){//a neg example that is covered + iterator.remove(); + currentNegExamples.remove(tree2Individual.get(tree)); + } + } + + // (re)build the current combined solution from all partial solutions + currentBestSolution = buildCombinedSolution(); + + logger.info("combined accuracy: " + dFormat.format(currentBestSolution.getAccuracy())); + } else { + String message = "No partial tree found which satisfies the minimal criteria."; + if(currentBestSolution != null) { + message += "- the best was: " + + currentBestSolution.getDescription() + + " with score " + currentBestSolution.getScore(); + } + logger.info(message); + } + + } + } + + isRunning = false; + + postProcess(); + + long nanoEndTime = System.nanoTime(); + logger.info("Finished in {}ms.", getCurrentRuntimeInMilliSeconds()); + logger.info("{} descriptions tested", expressionTests); + if(currentBestSolution != null) { + logger.info("Combined solution:{}", currentBestSolution.getDescription().toString().replace("\n", "")); + logger.info(currentBestSolution.getScore().toString()); + } else { + logger.info("Could not find a solution in the given time."); + } + } + + /** + * This method can be called for clean up of the solutions and re-ranking. + */ + private void postProcess() { + logger.trace("Post processing ..."); + // pick solutions with same accuracy, i.e. in the pos only case + // covering the same number of positive examples + SortedSet solutions = getSolutions(); + // pick solutions with accuracy above + // mas(maximum achievable score) - noise + List solutionsForPostProcessing = new ArrayList<>(); + for (EvaluatedRDFResourceTree solution : solutions) { + + double accuracy = solution.getTreeScore().getAccuracy(); + + double mas = heuristic.getMaximumAchievableScore(solution); + + double epsilon = 0.01; + + if(accuracy != mas && accuracy >= (mas - noise - epsilon)) { + solutionsForPostProcessing.add(solution); + } + } + + logger.trace("Finished post processing."); + } + + /** + * Compute a (partial) solution that covers as much positive examples as possible. 
+ * @return a (partial) solution + */ + private EvaluatedRDFResourceTree computeBestPartialSolution(){ + logger.info("Computing best partial solution..."); + bestCurrentScore = Double.NEGATIVE_INFINITY; + partialSolutionStartTime = System.currentTimeMillis(); + initTodoList(currentPosExampleTrees, currentNegExampleTrees); + + // generate id for each pos and neg example tree + TObjectIntMap index = new TObjectIntHashMap<>(this.currentPosExampleTrees.size() + this.currentNegExampleTrees.size()); + int id = 1; + for (RDFResourceTree posTree : currentPosExampleTrees) { + index.put(posTree, id++); + } + Set> processedCombinations = new HashSet<>(); + + ExecutorService pool = Executors.newFixedThreadPool(nrOfThreads); + + while(!partialSolutionTerminationCriteriaSatisfied()){ + logger.trace("ToDo list size: " + todoList.size()); + // pick best element from todo list + EvaluatedRDFResourceTree currentElement = todoList.poll(); + final RDFResourceTree currentTree = currentElement.getTree(); + + logger.trace("Next tree: {} ({})", currentElement.getBaseQueryTrees(), currentElement.getTreeScore()); + + // generate the LGG between the chosen tree and each false negative resp. uncovered positive example + Collection falseNegatives = currentElement.getFalseNegatives(); + + if(falseNegatives.isEmpty()) { // if the current solution covers already all pos examples +// addToSolutions(bestPartialSolutionTree); +// bestPartialSolutionTree = currentElement; + } + + List> list = falseNegatives.stream() + .filter(fn -> !processedCombinations.contains(Sets.union(currentElement.getBaseQueryTrees(), Sets.newHashSet(fn)))) + .map(fn -> CompletableFuture.supplyAsync(() -> computePartialSolution(currentTree, fn, Sets.newTreeSet(Sets.union(currentElement.getBaseQueryTrees(), Sets.newHashSet(fn)))), pool)) + .map(solutionFuture -> solutionFuture.thenAccept(solutions -> { + for (EvaluatedRDFResourceTree solution : solutions) { + logger.trace("solution: {} ({})", solution.getBaseQueryTrees(), solution.getTreeScore()); + processedCombinations.add(solution.getBaseQueryTrees()); + expressionTests++; + double score = solution.getScore(); + double mas = heuristic.getMaximumAchievableScore(solution); + + if (score >= bestCurrentScore) { + if (score > bestCurrentScore) { + timeBestSolutionFound = getCurrentRuntimeInMilliSeconds(); + logger.info("\tGot better solution after {}ms:" + solution.getTreeScore(), timeBestSolutionFound); +// logger.info("\t" + solutionAsString(solution.asEvaluatedDescription())); + bestCurrentScore = score; + bestPartialSolutionTree = solution; + } + // add to ToDo list, if not already contained in ToDo list or solution list + if (bestCurrentScore == 1.0 || mas > score) { +// todo(solution); + } + } else if (bestCurrentScore == 1.0 || mas >= bestCurrentScore) { // add to ToDo list if max. 
achievable score is higher +// todo(solution); + } else { + logger.trace("Too weak: {}", solution.getTreeScore()); +// System.err.println(solution.getEvaluatedDescription()); +// System.out.println("Too general"); +// System.out.println("MAS=" + mas + "\nBest=" + bestCurrentScore); +// todo(solution); + } + todo(solution); + addToSolutions(solution); + } + })) + .collect(Collectors.toList()); + list.forEach(c -> { + try { + c.get(); + } catch (InterruptedException e) { + e.printStackTrace(); + } catch (ExecutionException e) { + e.printStackTrace(); + } + }); + + +// while (it.hasNext() && !(useDisjunction && isPartialSolutionTimeExpired()) && !isTimeExpired()) { +// RDFResourceTree uncoveredTree = it.next(); +// logger.trace("Uncovered tree: " + uncoveredTree); +// // we should avoid the computation of lgg(t2,t1) if we already did lgg(t1,t2) +// Set baseQueryTrees = Sets.newTreeSet(currentElement.getBaseQueryTrees()); +// baseQueryTrees.add(uncoveredTree); +//// String s = ""; +//// for (RDFResourceTree queryTree : baseQueryTrees) { +//// s += index.get(queryTree) + ","; +//// } +//// System.err.println(s); +// if (!processedCombinations.add(baseQueryTrees)) { +//// System.err.println("skipping"); +//// continue; +// } +// +// PartialSolutionComputationTask callable = new PartialSolutionComputationTask(currentTree, uncoveredTree, baseQueryTrees); +// Future> future = pool.submit(callable); +// set.add(future); +// } + +// for (Future> future : set) { +// try { +// Set solutions = future.get(); +// +// +// } catch (InterruptedException e) { +// e.printStackTrace(); +// } catch (ExecutionException e) { +// e.printStackTrace(); +// } +// } +// addToSolutions(currentElement); + } + + pool.shutdown(); + try { + pool.awaitTermination(1, TimeUnit.SECONDS); + } catch (InterruptedException e) { + e.printStackTrace(); + } + + long endTime = System.currentTimeMillis(); + logger.info("...finished computing best partial solution in " + (endTime-partialSolutionStartTime) + "ms."); + EvaluatedDescription bestPartialSolution = bestPartialSolutionTree.asEvaluatedDescription(); + + logger.info("Best partial solution: " + solutionAsString(bestPartialSolution) + "\n(" + bestPartialSolution.getScore() + ")"); + + logger.trace("LGG time: " + MonitorFactory.getTimeMonitor("lgg").getTotal() + "ms"); + logger.trace("Avg. LGG time: " + MonitorFactory.getTimeMonitor("lgg").getAvg() + "ms"); + logger.info("#LGG computations: " + MonitorFactory.getTimeMonitor("lgg").getHits()); + + logger.trace("Subsumption test time: " + MonitorFactory.getTimeMonitor("subsumption").getTotal() + "ms"); + logger.trace("Avg. subsumption test time: " + MonitorFactory.getTimeMonitor("subsumption").getAvg() + "ms"); + logger.trace("#Subsumption tests: " + MonitorFactory.getTimeMonitor("subsumption").getHits()); + + return bestPartialSolutionTree; + } + + private String solutionAsString(EvaluatedDescription ed) { + return renderer.render(ed.getDescription()).replace("\n", "").replaceAll("\\\\s{2,}", " "); + } + + private boolean addToSolutions(EvaluatedRDFResourceTree solution) { + for (EvaluatedRDFResourceTree partialSolution : currentPartialSolutions) { + if(QueryTreeUtils.sameTrees(partialSolution.getTree(), solution.getTree())) { + return false; + } + } + return currentPartialSolutions.add(solution); + } + + /** + * Initializes the ToDo list with all distinct trees contained in the given list of positive + * example trees {@code posExamples} and negative example trees {@code negExamples}. 
+ * First, distinct trees are computed and afterwards, for each tree an initial score will be + * computed. + * @param posExamples the positive example trees + * @param negExamples the negative example trees + */ + private void initTodoList(List posExamples, List negExamples){ + todoList = new PriorityBlockingQueue<>(); +// EvaluatedRDFResourceTree dummy = new EvaluatedRDFResourceTree(new QueryTreeImpl((N)"TOP"), trees, 0d); +// todoList.add(dummy); + + // compute distinct trees, i.e. check if some of the trees already cover others + Collection distinctTrees = new ArrayList<>(); + for (RDFResourceTree queryTree : posExamples) { + boolean distinct = true; + for (RDFResourceTree otherTree : distinctTrees) { + if(!queryTree.equals(otherTree)){ + if(QueryTreeUtils.sameTrees(queryTree, otherTree)){ + distinct = false; + break; + } + } + } + if(distinct){ + distinctTrees.add(queryTree); + } + } + + // compute an initial score + for (RDFResourceTree queryTree : distinctTrees) { + EvaluatedRDFResourceTree evaluatedQueryTree = evaluateSimple(queryTree, false); + evaluatedQueryTree.setBaseQueryTrees(Collections.singleton(queryTree)); + todoList.add(evaluatedQueryTree); + } + } + + /** + * @return TRUE if the query tree is already contained in the solutions or + * todo list, otherwise FALSE + */ + private boolean isRedundant(RDFResourceTree tree) { + //check if not already contained in todo list + for (EvaluatedRDFResourceTree evTree : todoList) { + if(QueryTreeUtils.sameTrees(tree, evTree.getTree())){ + logger.trace("Not added to TODO list: Already contained in."); +// logger.trace(evTree.getBaseQueryTrees().toString()); + return true; + } + } + + //check if not already contained in solutions + for (EvaluatedRDFResourceTree evTree : currentPartialSolutions) { + if(QueryTreeUtils.sameTrees(tree, evTree.getTree())){ + logger.trace("Not added to partial solutions list: Already contained in."); + return true; + } + } + return false; + } + + /** + * Add tree to ToDo list if not already contained in that list or the solutions. + * @param solution the solution + */ + private void todo(EvaluatedRDFResourceTree solution){ + logger.trace("Added to TODO list."); + todoList.add(solution); + } + + private EvaluatedRDFResourceTree evaluateSimple(RDFResourceTree tree, boolean useSpecifity){ + //1. get a score for the coverage = recall oriented + //compute positive examples which are not covered by LGG + List uncoveredPositiveExampleTrees = getUncoveredTrees(tree, currentPosExampleTrees); + Set uncoveredPosExamples = new TreeSet<>(); + for (RDFResourceTree queryTree : uncoveredPositiveExampleTrees) { + uncoveredPosExamples.add(tree2Individual.get(queryTree)); + } + //compute negative examples which are covered by LGG + Collection coveredNegativeExampleTrees = getCoveredTrees(tree, currentNegExampleTrees); + Set coveredNegExamples = new TreeSet<>(); + for (RDFResourceTree queryTree : coveredNegativeExampleTrees) { + coveredNegExamples.add(tree2Individual.get(queryTree)); + } + //compute score + int coveredPositiveExamples = currentPosExampleTrees.size() - uncoveredPositiveExampleTrees.size(); + double recall = coveredPositiveExamples / (double)currentPosExampleTrees.size(); + double precision = (coveredNegativeExampleTrees.size() + coveredPositiveExamples == 0) + ? 0 + : coveredPositiveExamples / (double)(coveredPositiveExamples + coveredNegativeExampleTrees.size()); + + double coverageScore = Heuristics.getFScore(recall, precision, beta); + + //2. get a score for the specifity of the query, i.e. 
how many edges/nodes = precision oriented + int nrOfSpecificNodes = 0; + for (RDFResourceTree childNode : QueryTreeUtils.getNodes(tree)) { + if(!childNode.isVarNode()){ + nrOfSpecificNodes++; + } + } + double specifityScore = 0d; + if(useSpecifity){ + specifityScore = Math.log(nrOfSpecificNodes); + } else { + specifityScore = 1 / (double) nrOfSpecificNodes; + } + + //3.compute the total score + double score = coverageWeight * coverageScore + specifityWeight * specifityScore; + + QueryTreeScore queryTreeScore = new QueryTreeScore(score, coverageScore, + new TreeSet<>(Sets.difference(currentPosExamples, uncoveredPosExamples)), uncoveredPosExamples, + coveredNegExamples, new TreeSet<>(Sets.difference(currentNegExamples, coveredNegExamples)), + specifityScore, nrOfSpecificNodes); + +// QueryTreeScore queryTreeScore = new QueryTreeScore(score, coverageScore, +// null,null,null,null, +// specifityScore, nrOfSpecificNodes); + + EvaluatedRDFResourceTree evaluatedTree = new EvaluatedRDFResourceTree(tree, uncoveredPositiveExampleTrees, coveredNegativeExampleTrees, queryTreeScore); + + //TODO use only the heuristic to compute the score + score = heuristic.getScore(evaluatedTree); + queryTreeScore.setScore(score); + queryTreeScore.setAccuracy(score); + + return evaluatedTree; + } + + /** + * Evaluated a query tree such that it returns a set of evaluated query trees. + * A set is returned because there are several ways how to convert literal nodes. + * @param tree the query tree + * @param useSpecifity whether to use SPECIFITY as measure + * @return a set of evaluated query trees + */ + private Set evaluate(RDFResourceTree tree, boolean useSpecifity){ + Set evaluatedTrees = new TreeSet<>(); + + LiteralNodeSubsumptionStrategy[] strategies = LiteralNodeSubsumptionStrategy.values(); + strategies = new LiteralNodeSubsumptionStrategy[]{ + LiteralNodeSubsumptionStrategy.DATATYPE, +// LiteralNodeSubsumptionStrategy.INTERVAL, +// LiteralNodeSubsumptionStrategy.MIN, +// LiteralNodeSubsumptionStrategy.MAX, + }; + for (LiteralNodeSubsumptionStrategy strategy : strategies) { + // 1. get a score for the coverage = recall oriented + List uncoveredPositiveExampleTrees = new ArrayList<>(); + List coveredNegativeExampleTrees = new ArrayList<>(); + + // compute positive examples which are not covered by LGG + for (RDFResourceTree posTree : currentPosExampleTrees) { +// System.out.print(currentPosExampleTrees.indexOf(posTree) + ":"); + if(!QueryTreeUtils.isSubsumedBy(posTree, tree, entailment, reasoner)){ +// System.err.println(posTree.getStringRepresentation(true));System.err.println(tree.getStringRepresentation(true)); +// System.out.println("FALSE"); + uncoveredPositiveExampleTrees.add(posTree); + } else { +// System.out.println("TRUE"); + } + } + + // compute negative examples which are covered by LGG + for (RDFResourceTree negTree : currentNegExampleTrees) { + if(QueryTreeUtils.isSubsumedBy(negTree, tree, entailment, reasoner)){ + coveredNegativeExampleTrees.add(negTree); + } + } + + // convert to individuals + Set uncoveredPosExamples = asIndividuals(uncoveredPositiveExampleTrees); + Set coveredNegExamples = asIndividuals(coveredNegativeExampleTrees); + + // compute score + int coveredPositiveExamples = currentPosExampleTrees.size() - uncoveredPositiveExampleTrees.size(); + double recall = coveredPositiveExamples / (double)currentPosExampleTrees.size(); + double precision = (coveredNegativeExampleTrees.size() + coveredPositiveExamples == 0) + ? 
0 + : coveredPositiveExamples / (double)(coveredPositiveExamples + coveredNegativeExampleTrees.size()); + + double coverageScore = Heuristics.getFScore(recall, precision, beta); + + // 2. get a score for the specifity of the query, i.e. how many edges/nodes = precision oriented + int nrOfSpecificNodes = 0; + for (RDFResourceTree childNode : QueryTreeUtils.getNodes(tree)) { + if(!childNode.isVarNode()){ + nrOfSpecificNodes++; + } + } + double specifityScore = 0d; + if(useSpecifity){ + specifityScore = Math.log(nrOfSpecificNodes); + } + + // 3.compute the total score + double score = coverageWeight * coverageScore + specifityWeight * specifityScore; + + QueryTreeScore queryTreeScore = new QueryTreeScore(score, coverageScore, + new TreeSet<>(Sets.difference(currentPosExamples, uncoveredPosExamples)), uncoveredPosExamples, + coveredNegExamples, new TreeSet<>(Sets.difference(currentNegExamples, coveredNegExamples)), + specifityScore, nrOfSpecificNodes); + + EvaluatedRDFResourceTree evaluatedTree = new EvaluatedRDFResourceTree(tree, uncoveredPositiveExampleTrees, coveredNegativeExampleTrees, queryTreeScore); + + //TODO use only the heuristic to compute the score + score = heuristic.getScore(evaluatedTree); + queryTreeScore.setScore(score); + queryTreeScore.setAccuracy(score); + + evaluatedTrees.add(evaluatedTree); + } + + return evaluatedTrees; + } + + /** + * Evaluated a query tree such that it returns a set of evaluated query trees. + * A set is returned because there are several ways how to convert literal nodes. + * @param tree the query tree + * @param useSpecifity whether to use SPECIFITY as measure + * @return a set of evaluated query trees + */ + private Set evaluate2(RDFResourceTree tree, boolean useSpecifity){ + Set evaluatedTrees = new TreeSet<>(); + + //test different strategies on the conversion of literal nodes + Set combinations = new HashSet<>(); + + for (LiteralNodeConversionStrategy strategy : strategies) { + OWLClassExpression ce = QueryTreeUtils.toOWLClassExpression(tree); + combinations.add(ce); + } + //compute all combinations of different types of facets +// OWLClassExpression ce = tree.asOWLClassExpression(LiteralNodeConversionStrategy.FACET_RESTRICTION); +// combinations = ce.accept(new ClassExpressionLiteralCombination()); + for (OWLClassExpression c : combinations) { + //convert to individuals + SortedSet coveredExamples = reasoner.getIndividuals(c); + Set coveredPosExamples = new TreeSet<>(Sets.intersection(currentPosExamples, coveredExamples)); + Set uncoveredPosExamples = new TreeSet<>(Sets.difference(currentPosExamples, coveredExamples)); + Set coveredNegExamples = new TreeSet<>(Sets.intersection(currentNegExamples, coveredExamples)); + Set uncoveredNegExamples = new TreeSet<>(Sets.difference(currentNegExamples, coveredExamples)); + + //compute score + double recall = coveredPosExamples.size() / (double)currentPosExamples.size(); + double precision = (coveredNegExamples.size() + coveredPosExamples.size() == 0) + ? 0 + : coveredPosExamples.size() / (double)(coveredPosExamples.size() + coveredNegExamples.size()); + + double coverageScore = Heuristics.getFScore(recall, precision, beta); + + //2. get a score for the specificity of the query, i.e. 
how many edges/nodes = precision oriented + int nrOfSpecificNodes = 0; + for (RDFResourceTree childNode : QueryTreeUtils.getNodes(tree)){ + if(!childNode.isVarNode()){ + nrOfSpecificNodes++; + } + } + double specifityScore = 0d; + if(useSpecifity){ + specifityScore = Math.log(nrOfSpecificNodes); + } + + //3.compute the total score + double score = coverageWeight * coverageScore + specifityWeight * specifityScore; + + QueryTreeScore queryTreeScore = new QueryTreeScore( + score, coverageScore, + coveredPosExamples, uncoveredPosExamples, + coveredNegExamples, uncoveredNegExamples, + specifityScore, nrOfSpecificNodes); + + //TODO use only the heuristic to compute the score + EvaluatedRDFResourceTree evaluatedTree = new EvaluatedRDFResourceTree(tree, + asQueryTrees(uncoveredPosExamples), asQueryTrees(coveredNegExamples), queryTreeScore); + score = heuristic.getScore(evaluatedTree); + queryTreeScore.setScore(score); + queryTreeScore.setAccuracy(score); + + + EvaluatedDescription evaluatedDescription = new EvaluatedDescription(c, queryTreeScore); + + evaluatedTree.setDescription(evaluatedDescription); + + evaluatedTrees.add(evaluatedTree); + } + return evaluatedTrees; + } + + private EvaluatedDescription buildCombinedSolution(){ + EvaluatedDescription bestCombinedSolution = null; + double bestScore = Double.NEGATIVE_INFINITY; + LiteralNodeConversionStrategy[] strategies = LiteralNodeConversionStrategy.values(); + strategies = new LiteralNodeConversionStrategy[]{LiteralNodeConversionStrategy.DATATYPE}; + for (LiteralNodeConversionStrategy strategy : strategies) { + EvaluatedDescription combinedSolution; + if(partialSolutions.size() == 1){ + combinedSolution = partialSolutions.get(0).asEvaluatedDescription(); + } else { + Set disjuncts = new TreeSet<>(); + + Set posCovered = new HashSet<>(); + Set negCovered = new HashSet<>(); + + //build the union of all class expressions + OWLClassExpression partialDescription; + for (EvaluatedRDFResourceTree partialSolution : partialSolutions) { + partialDescription = partialSolution.asEvaluatedDescription().getDescription(); + disjuncts.add(partialDescription); + posCovered.addAll(partialSolution.getTreeScore().getCoveredPositives()); + negCovered.addAll(partialSolution.getTreeScore().getCoveredNegatives()); + } + OWLClassExpression unionDescription = dataFactory.getOWLObjectUnionOf(disjuncts); + + Set posNotCovered = Sets.difference(lp.getPositiveExamples(), posCovered); + Set negNotCovered = Sets.difference(lp.getNegativeExamples(), negCovered); + + //compute the coverage + double recall = posCovered.size() / (double)lp.getPositiveExamples().size(); + double precision = (posCovered.size() + negCovered.size() == 0) + ? 
0 + : posCovered.size() / (double)(posCovered.size() + negCovered.size()); + + double coverageScore = Heuristics.getFScore(recall, precision, beta); + +// ScoreTwoValued score = new ScoreTwoValued(posCovered, posNotCovered, negCovered, negNotCovered); +// score.setAccuracy(coverageScore); + QueryTreeScore score = new QueryTreeScore(coverageScore, coverageScore, posCovered, posNotCovered, negCovered, negNotCovered, -1, -1); + + combinedSolution = new EvaluatedDescription(unionDescription, score); + } + if(combinedSolution.getAccuracy() > bestScore){ + bestCombinedSolution = combinedSolution; + bestCurrentScore = combinedSolution.getAccuracy(); + } + } + return bestCombinedSolution; + } + + private void reset(){ + stop = false; + isRunning = true; + + currentBestSolution = null; + partialSolutions = new ArrayList<>(); + + bestCurrentScore = minimumTreeScore; + + MonitorFactory.getTimeMonitor("lgg").reset(); + nanoStartTime = System.nanoTime(); + } + + /* (non-Javadoc) + * @see org.dllearner.core.StoppableLearningAlgorithm#stop() + */ + @Override + public void stop() { + stop = true; + } + + /* (non-Javadoc) + * @see org.dllearner.core.AbstractCELA#getCurrentlyBestDescription() + */ + @Override + public OWLClassExpression getCurrentlyBestDescription() { + return currentBestSolution.getDescription(); + } + + /* (non-Javadoc) + * @see org.dllearner.core.AbstractCELA#getCurrentlyBestEvaluatedDescription() + */ + @Override + public EvaluatedDescription getCurrentlyBestEvaluatedDescription() { + return currentBestSolution; + } + + /* (non-Javadoc) + * @see org.dllearner.core.StoppableLearningAlgorithm#isRunning() + */ + @Override + public boolean isRunning() { + return isRunning; + } + +// @Autowired +// public void setLearningProblem(PosNegLP learningProblem) { +// this.lp = learningProblem; +// } + +// @Autowired + @Override + public void setReasoner(AbstractReasonerComponent reasoner){ + super.setReasoner(reasoner); +// loadModel(); + } + + private void loadModel(){ + model = ModelFactory.createDefaultModel(); + for (KnowledgeSource ks : reasoner.getSources()) { + if(ks instanceof OWLFile){ + try { + model.read(((OWLFile) ks).getURL().openStream(), null); + } catch (IOException e) { + e.printStackTrace(); + } + } else if(ks instanceof OWLAPIOntology){ + ByteArrayInputStream bais = new ByteArrayInputStream(((OWLAPIOntology) ks).getConverter().convert(((OWLAPIOntology) ks).getOntology())); + model.read(bais, null); + try { + bais.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + private Set asIndividuals(Collection trees){ + Set individuals = new HashSet<>(trees.size()); + for (RDFResourceTree queryTree : trees) { + individuals.add(tree2Individual.get(queryTree)); + } + return individuals; + } + + private Set asQueryTrees(Collection individuals){ + Set trees = new HashSet<>(individuals.size()); + for (OWLIndividual ind : individuals) { + trees.add(tree2Individual.inverse().get(ind)); + } + return trees; + } + + /** + * Computes all trees from the given list {@code allTrees} which are subsumed by {@code tree}. 
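+ * Subsumption is decided by {@link QueryTreeUtils#isSubsumedBy}, i.e. a tree from {@code trees} counts as covered if it is at least as specific as {@code tree}.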
+ * @param tree the tree + * @param trees all trees + * @return all trees from the given list {@code trees} which are subsumed by {@code tree} + */ + private List<RDFResourceTree> getCoveredTrees(RDFResourceTree tree, List<RDFResourceTree> trees){ + List<RDFResourceTree> coveredTrees = new ArrayList<>(); + for (RDFResourceTree queryTree : trees) { + if(QueryTreeUtils.isSubsumedBy(queryTree, tree)){ + coveredTrees.add(queryTree); + } + } + return coveredTrees; + } + + /** + * Computes all trees from the given list {@code trees} which are not subsumed by {@code tree}. + * @param tree the tree + * @param trees the trees + * @return all trees from the given list {@code trees} which are not subsumed by {@code tree}. + */ + private List<RDFResourceTree> getUncoveredTrees(RDFResourceTree tree, List<RDFResourceTree> trees){ + List<RDFResourceTree> uncoveredTrees = new ArrayList<>(); + for (RDFResourceTree queryTree : trees) { + if(!QueryTreeUtils.isSubsumedBy(queryTree, tree)){ + uncoveredTrees.add(queryTree); + } + } + return uncoveredTrees; + } + + private boolean terminationCriteriaSatisfied() { + //stop was called or time expired + if(stop || isTimeExpired()){ + return true; + } + + // stop if there are no more positive examples to cover + if (stopOnFirstDefinition && currentPosExamples.isEmpty()) { + return true; + } + + // we stop when the score of the last tree added is too low + // (indicating that the algorithm could not find anything appropriate + // in the timeframe set) + if (bestCurrentScore < minimumTreeScore) { + return true; + } + + // stop when almost all positive examples have been covered + if (tryFullCoverage) { + return false; + } else { + int maxPosRemaining = (int) Math.ceil(startPosExamplesSize * 0.05d); + return (currentPosExamples.size() <= maxPosRemaining); + } + } + + private boolean partialSolutionTerminationCriteriaSatisfied(){ + return stop || todoList.isEmpty() || currentPosExampleTrees.isEmpty() || (useDisjunction && isPartialSolutionTimeExpired()) || isTimeExpired(); + } + + private boolean isPartialSolutionTimeExpired(){ + return maxTreeComputationTimeInSeconds > 0 && getRemainingPartialSolutionTime() <= 0; + } + + private long getRemainingPartialSolutionTime() { + return (long) (maxTreeComputationTimeInSeconds - (System.currentTimeMillis() - partialSolutionStartTime) / 1000); + } + + /** + * Shows the current setup of the algorithm. + */ + private void printSetup(){ + String setup = "Setup:"; + setup += "\n#Pos. examples:" + currentPosExampleTrees.size(); + setup += "\n#Neg. examples:" + currentNegExampleTrees.size(); + setup += "\nHeuristic:" + heuristic.getHeuristicType().name(); + setup += "\nNoise value=" + noise; + setup += "\nbeta=" + beta; + logger.info(setup); + } + + /** + * @param noisePercentage the noisePercentage to set + */ + public void setNoisePercentage(double noisePercentage) { + this.noisePercentage = noisePercentage; + } + + /** + * @param noise the noise to set + */ + public void setNoise(double noise) { + this.noise = noise; + } + + /** + * Default value is 1. Lower values force importance of covering positive examples. + * @param beta the beta to set + */ + public void setBeta(double beta) { + this.beta = beta; + } + + /** + * Set the max. execution time for the computation of a partial solution. If this value isn't set, the + * max. algorithm runtime will be used; thus, in the worst case, only one partial solution will be computed. + * + * @param maxTreeComputationTimeInSeconds the max.
computation time for a partial solution tree + */ + public void setMaxTreeComputationTimeInSeconds(double maxTreeComputationTimeInSeconds) { + this.maxTreeComputationTimeInSeconds = maxTreeComputationTimeInSeconds; + } + + /** + * @return the heuristic + */ + public QueryTreeHeuristic getHeuristic() { + return heuristic; + } + + /** + * @param heuristic the heuristic to set + */ + public void setHeuristic(QueryTreeHeuristic heuristic) { + this.heuristic = heuristic; + } + + /** + * @param treeFactory the treeFactory to set + */ + public void setTreeFactory(QueryTreeFactory treeFactory) { + this.treeFactory = treeFactory; + } + + public EvaluatedRDFResourceTree getBestSolution(){ + return currentPartialSolutions.last(); + } + + public SortedSet<EvaluatedRDFResourceTree> getSolutions(){ + return currentPartialSolutions; + } + + public List<EvaluatedRDFResourceTree> getSolutionsAsList(){ + // Collections.sort(list, Collections.reverseOrder()); + return new ArrayList<>(currentPartialSolutions); + } + + /** + * @param positiveExampleTrees the positive example trees to set + */ + public void setPositiveExampleTrees(Map<OWLIndividual, RDFResourceTree> positiveExampleTrees) { + this.currentPosExampleTrees = new ArrayList<>(positiveExampleTrees.values()); + this.currentPosExamples = new HashSet<>(positiveExampleTrees.keySet()); + + for (Entry<OWLIndividual, RDFResourceTree> entry : positiveExampleTrees.entrySet()) { + OWLIndividual ind = entry.getKey(); + RDFResourceTree tree = entry.getValue(); + tree2Individual.put(tree, ind); + } + } + + /** + * @param negativeExampleTrees the negative example trees to set + */ + public void setNegativeExampleTrees(Map<OWLIndividual, RDFResourceTree> negativeExampleTrees) { + this.currentNegExampleTrees = new ArrayList<>(negativeExampleTrees.values()); + this.currentNegExamples = new HashSet<>(negativeExampleTrees.keySet()); + + for (Entry<OWLIndividual, RDFResourceTree> entry : negativeExampleTrees.entrySet()) { + OWLIndividual ind = entry.getKey(); + RDFResourceTree tree = entry.getValue(); + tree2Individual.put(tree, ind); + } + } + + /** + * @param ks the ks to set + */ + @Autowired + public void setKs(SparqlEndpointKS ks) { + this.ks = ks; + } + + /** + * @param maxTreeDepth the maximum depth of the trees, if those have to be generated + * first. The default depth is 2.
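+ * Note: deeper trees capture longer property paths, but CBD retrieval and LGG computation become considerably more expensive as the depth grows.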
+ */ + public void setMaxTreeDepth(int maxTreeDepth) { + this.maxTreeDepth = maxTreeDepth; + } + + /** + * @return the runtime in ms until the best solution was found + */ + public long getTimeBestSolutionFound() { + return timeBestSolutionFound; + } + + /* (non-Javadoc) + * @see java.lang.Object#clone() + */ + @Override + public Object clone() throws CloneNotSupportedException { + super.clone(); + return new QTL2DisjunctiveMultiThreaded(this); + } + + private Set computePartialSolution(RDFResourceTree tree1, RDFResourceTree tree2, Set baseQueryTrees) { + try { +// System.out.println(baseQueryTrees); + + LGGGeneratorSimple lggGenerator = new LGGGeneratorSimple(); + // compute the LGG + MonitorFactory.getTimeMonitor("lgg").start(); + ((LGGGeneratorSimple) lggGenerator).setTimeout(getRemainingPartialSolutionTime(), TimeUnit.SECONDS); + RDFResourceTree lgg = lggGenerator.getLGG(tree1, tree2); + MonitorFactory.getTimeMonitor("lgg").stop(); +// System.out.println("COMPLETE:" + ((LGGGeneratorSimple)lggGenerator).isComplete()); +// logger.info("LGG: " + lgg.getStringRepresentation()); + + // redundancy check + boolean redundant = isRedundant(lgg); + if (redundant) { + logger.trace("redundant"); + return Collections.emptySet(); + } + + // evaluate the LGG + Set solutions = evaluate(lgg, true); + solutions.forEach(s -> s.setBaseQueryTrees(baseQueryTrees)); + + return solutions; + } catch (Exception e) { + e.printStackTrace(); + } + return null; + } +} diff --git a/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/impl/RDFResourceTree.java b/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/impl/RDFResourceTree.java index 7dfb117424..2f97aa2de7 100644 --- a/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/impl/RDFResourceTree.java +++ b/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/impl/RDFResourceTree.java @@ -64,7 +64,7 @@ public enum Rendering { private final int id; public static final Node DEFAULT_VAR_NODE = NodeFactory.createVariable(""); - private static final Node DEFAULT_LITERAL_NODE = NodeFactory.createLiteral(""); + private static final Node DEFAULT_LITERAL_NODE = NodeFactory.createLiteral("DEF"); // a datatype which only exists if node is literal private RDFDatatype datatype; diff --git a/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/AbstractLGGGenerator.java b/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/AbstractLGGGenerator.java index 7138c85d2a..728c51a3aa 100644 --- a/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/AbstractLGGGenerator.java +++ b/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/AbstractLGGGenerator.java @@ -46,9 +46,12 @@ public abstract class AbstractLGGGenerator implements LGGGenerator { private long timeoutMillis = -1; private long startTime; + + protected volatile boolean stop = false; private void reset() { + stop = false; subCalls = 0; } @@ -71,34 +74,41 @@ public RDFResourceTree getLGG(RDFResourceTree tree1, RDFResourceTree tree2, bool mon.stop(); // apply some post-processing - postProcess(lgg); + lgg = postProcess(lgg); addNumbering(0, lgg); return lgg; } + @Override public void setTimeout(long timeout, TimeUnit timeoutUnits) { this.timeoutMillis = timeoutUnits.toMillis(timeout); } + @Override public long getTimeout() { return timeoutMillis; } + @Override + public void abort() { + stop = true; + } + protected boolean isTimeout() { return 
System.currentTimeMillis() - startTime >= timeoutMillis; } - protected void postProcess(RDFResourceTree tree) { + protected RDFResourceTree postProcess(RDFResourceTree tree) { // prune the tree according to the given entailment QueryTreeUtils.prune(tree, reasoner, entailment); + return tree; } protected void preProcess(RDFResourceTree tree) { } - private void addNumbering(int nodeId, RDFResourceTree tree){ // tree.setId(nodeId); diff --git a/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/LGGGeneratorExt.java b/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/LGGGeneratorExt.java new file mode 100644 index 0000000000..233c7e1cc8 --- /dev/null +++ b/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/LGGGeneratorExt.java @@ -0,0 +1,232 @@ +/** + * Copyright (C) 2007 - 2016, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.dllearner.algorithms.qtl.operations.lgg; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import org.aksw.jena_sparql_api.cache.h2.CacheUtilsH2; +import org.aksw.jena_sparql_api.core.FluentQueryExecutionFactory; +import org.aksw.jena_sparql_api.core.QueryExecutionFactory; +import org.apache.jena.graph.NodeFactory; +import org.dllearner.algorithms.qtl.QueryTreeUtils; +import org.dllearner.algorithms.qtl.datastructures.NodeInv; +import org.dllearner.algorithms.qtl.datastructures.impl.RDFResourceTree; +import org.dllearner.algorithms.qtl.impl.QueryTreeFactory; +import org.dllearner.algorithms.qtl.impl.QueryTreeFactoryBase; +import org.dllearner.algorithms.qtl.util.StopURIsDBpedia; +import org.dllearner.algorithms.qtl.util.StopURIsOWL; +import org.dllearner.algorithms.qtl.util.StopURIsRDFS; +import org.dllearner.algorithms.qtl.util.filters.NamespaceDropStatementFilter; +import org.dllearner.algorithms.qtl.util.filters.PredicateDropStatementFilter; +import org.dllearner.kb.sparql.ConciseBoundedDescriptionGenerator; +import org.dllearner.kb.sparql.ConciseBoundedDescriptionGeneratorImpl; +import org.dllearner.kb.sparql.SparqlEndpoint; + +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import org.apache.jena.datatypes.RDFDatatype; +import org.apache.jena.graph.Node; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ResourceFactory; +import org.apache.jena.sparql.vocabulary.FOAF; + +/** + * An LGG generator based on syntax and structure only, i.e. without taking into account any type of + * Semantics. 
+ * + * @author Lorenz Bühmann + * + */ +public class LGGGeneratorSimple extends AbstractLGGGenerator { + + private boolean complete = true; + + @Override + protected RDFResourceTree computeLGG(RDFResourceTree tree1, RDFResourceTree tree2, boolean learnFilters){ + subCalls++; + + // 1. compare the root node + // if both root nodes have same URI or literal value, just return one of the two trees as LGG + if ((tree1.isResourceNode() || tree1.isLiteralValueNode()) && tree1.getData().equals(tree2.getData())) { + logger.trace("Early termination. Tree 1 {} and tree 2 {} describe the same resource.", tree1, tree2); + return tree1; + } + + // handle literal nodes with same datatype + if (tree1.isLiteralNode() && tree2.isLiteralNode()) { + RDFDatatype d1 = tree1.getData().getLiteralDatatype(); + RDFDatatype d2 = tree2.getData().getLiteralDatatype(); + + if (d1 != null && d1.equals(d2)) { + return new RDFResourceTree(d1); + } + } + + // else create new empty tree + RDFResourceTree lgg = new RDFResourceTree(); + + // 2. compare the edges + // we only have to compare edges contained in both trees + // outgoing edges + List> commonEdges = new ArrayList<>(); + commonEdges.add(Sets.intersection( + tree1.getEdges().stream().filter(e -> !(e instanceof NodeInv)).collect(Collectors.toSet()), + tree2.getEdges().stream().filter(e -> !(e instanceof NodeInv)).collect(Collectors.toSet()))); + // incoming edges + commonEdges.add(Sets.intersection( + tree1.getEdges().stream().filter(e -> e instanceof NodeInv).collect(Collectors.toSet()), + tree2.getEdges().stream().filter(e -> e instanceof NodeInv).collect(Collectors.toSet()))); + + for (Set edges : commonEdges) { + for (Node edge : edges) { + if(isTimeout()) { + complete = false; + break; + } + Set addedChildren = new HashSet<>(); + // loop over children of first tree + for (RDFResourceTree child1 : tree1.getChildren(edge)) { + if(isTimeout()) { + complete = false; + break; + } + // loop over children of second tree + for (RDFResourceTree child2 : tree2.getChildren(edge)) { + if(isTimeout()) { + complete = false; + break; + } + // compute the LGG + RDFResourceTree lggChild = computeLGG(child1, child2, learnFilters); + + // check if there was already a more specific child computed before + // and if so don't add the current one + boolean add = true; + for (Iterator it = addedChildren.iterator(); it.hasNext(); ) { + RDFResourceTree addedChild = it.next(); + if (QueryTreeUtils.isSubsumedBy(addedChild, lggChild)) { + // logger.trace("Skipped adding: Previously added child {} is subsumed by {}.", + // addedChild.getStringRepresentation(), + // lggChild.getStringRepresentation()); + add = false; + break; + } else if (QueryTreeUtils.isSubsumedBy(lggChild, addedChild)) { + // logger.trace("Removing child node: {} is subsumed by previously added child {}.", + // lggChild.getStringRepresentation(), + // addedChild.getStringRepresentation()); + lgg.removeChild(addedChild, edge); + it.remove(); + } + } + if (add) { + lgg.addChild(lggChild, edge); + addedChildren.add(lggChild); + // logger.trace("Adding child {}", lggChild.getStringRepresentation()); + } + } + } + } + } + + return lgg; + } + + public boolean isComplete() { + return complete; + } + + public static void main(String[] args) throws Exception { + // knowledge base +// SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); +// QueryExecutionFactory qef = FluentQueryExecutionFactory +// .http(endpoint.getURL().toString(), endpoint.getDefaultGraphURIs()).config() +// 
.withCache(CacheUtilsH2.createCacheFrontend("/tmp/cache", false, TimeUnit.DAYS.toMillis(60))) +// .withPagination(10000).withDelay(50, TimeUnit.MILLISECONDS).end().create(); +// +// // tree generation +// ConciseBoundedDescriptionGenerator cbdGenerator = new ConciseBoundedDescriptionGeneratorImpl(qef); +// int maxDepth = 2; +// cbdGenerator.setRecursionDepth(maxDepth); +// +// QueryTreeFactory treeFactory = new QueryTreeFactoryBase(); +// treeFactory.setMaxDepth(maxDepth); +// treeFactory.addDropFilters( +// new PredicateDropStatementFilter(StopURIsDBpedia.get()), +// new PredicateDropStatementFilter(StopURIsRDFS.get()), +// new PredicateDropStatementFilter(StopURIsOWL.get()), +// new NamespaceDropStatementFilter( +// Sets.newHashSet( +// "http://dbpedia.org/property/", +// "http://purl.org/dc/terms/", +// "http://dbpedia.org/class/yago/", +// "http://www.w3.org/2003/01/geo/wgs84_pos#", +// "http://www.georss.org/georss/", +// FOAF.getURI() +// ) +// ) +// ); +// List trees = new ArrayList<>(); +// List resources = Lists.newArrayList("http://dbpedia.org/resource/Leipzig", "http://dbpedia.org/resource/Dresden"); +// for(String resource : resources){ +// try { +// System.out.println(resource); +// Model model = cbdGenerator.getConciseBoundedDescription(resource); +// RDFResourceTree tree = treeFactory.getQueryTree(ResourceFactory.createResource(resource), model); +// System.out.println(tree.getStringRepresentation()); +// trees.add(tree); +// } catch (Exception e) { +// e.printStackTrace(); +// } +// } + + // LGG computation + LGGGenerator lggGen = new LGGGeneratorSimple(); +// RDFResourceTree lgg = lggGen.getLGG(trees); +// +// System.out.println("LGG"); +// System.out.println(lgg.getStringRepresentation()); +// System.out.println(QueryTreeUtils.toSPARQLQueryString(lgg)); +// System.out.println(QueryTreeUtils.toOWLClassExpression(lgg)); + + Node edge = NodeFactory.createURI("p"); + Node edgeInv = new NodeInv(NodeFactory.createURI("p")); + + RDFResourceTree tree1 = new RDFResourceTree(NodeFactory.createURI("urn:a")); + tree1.addChild(new RDFResourceTree(NodeFactory.createURI("urn:c")), edge); + tree1.addChild(new RDFResourceTree(NodeFactory.createURI("urn:d")), edgeInv); + System.out.println(tree1.getStringRepresentation()); + + RDFResourceTree tree2 = new RDFResourceTree(NodeFactory.createURI("urn:b")); + tree2.addChild(new RDFResourceTree(NodeFactory.createURI("urn:c")), edge); + tree2.addChild(new RDFResourceTree(NodeFactory.createURI("urn:d")), edgeInv); + System.out.println(tree2.getStringRepresentation()); + + RDFResourceTree lgg = lggGen.getLGG(tree1, tree2); + System.out.println("LGG"); + System.out.println(lgg.getStringRepresentation()); + System.out.println(QueryTreeUtils.toSPARQLQueryString(lgg)); + } + +} diff --git a/components-core/src/main/java/org/dllearner/algorithms/qtl/util/filters/TreeFilter.java b/components-core/src/main/java/org/dllearner/algorithms/qtl/util/filters/TreeFilter.java new file mode 100644 index 0000000000..91b59daadf --- /dev/null +++ b/components-core/src/main/java/org/dllearner/algorithms/qtl/util/filters/TreeFilter.java @@ -0,0 +1,7 @@ +package org.dllearner.algorithms.qtl.util.filters; + +/** + * @author Lorenz Buehmann + */ +public interface TreeFilter { +} diff --git a/components-core/src/main/java/org/dllearner/kb/sparql/CBDStructureTree.java b/components-core/src/main/java/org/dllearner/kb/sparql/CBDStructureTree.java index 4cfc85418d..8c8e745a13 100644 --- a/components-core/src/main/java/org/dllearner/kb/sparql/CBDStructureTree.java +++ 
b/components-core/src/main/java/org/dllearner/kb/sparql/CBDStructureTree.java @@ -43,6 +43,14 @@ public CBDStructureTree addOutNode() { return child; } + public boolean hasOutChild() { + return children.stream().anyMatch(c -> c.isOutNode()); + } + + public boolean hasInChild() { + return children.stream().anyMatch(c -> c.isInNode()); + } + public static CBDStructureTree fromTreeString(String treeString) { treeString = treeString.replace(":", ""); diff --git a/components-core/src/main/java/org/dllearner/utilities/QueryUtils.java b/components-core/src/main/java/org/dllearner/utilities/QueryUtils.java index a6f77e1f38..3205618212 100644 --- a/components-core/src/main/java/org/dllearner/utilities/QueryUtils.java +++ b/components-core/src/main/java/org/dllearner/utilities/QueryUtils.java @@ -51,6 +51,7 @@ import java.io.FileOutputStream; import java.io.IOException; import java.util.*; +import java.util.List; import java.util.Map.Entry; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -309,8 +310,13 @@ private static void getOptimalCBDStructure(Query query, CBDStructureTree structu .filter(tp -> !direction.equals("in") || !tp.getObject().matches(parent)) .collect(Collectors.toSet()); if(!tmp.isEmpty()) { - CBDStructureTree outChild = structureTree.addOutNode(); - + List outChildren = structureTree.getChildren().stream().filter(t -> t.isOutNode()).collect(Collectors.toList()); + CBDStructureTree outChild; + if(outChildren.isEmpty()) { + outChild = structureTree.addOutNode(); + } else { + outChild = outChildren.get(0); + } tmp.stream() .filter(tp -> tp.getObject().isVariable()) .map(tp -> tp.getObject()) diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/BenchmarkDescriptionGenerator.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/BenchmarkDescriptionGenerator.java new file mode 100644 index 0000000000..1778e121a6 --- /dev/null +++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/BenchmarkDescriptionGenerator.java @@ -0,0 +1,381 @@ +/** + * Copyright (C) 2007 - 2016, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +package org.dllearner.algorithms.qtl.experiments; + +import com.google.common.base.Charsets; +import com.google.common.io.Files; +import com.mxgraph.layout.mxGraphLayout; +import com.mxgraph.layout.orthogonal.mxOrthogonalLayout; +import com.mxgraph.swing.mxGraphComponent; +import com.mxgraph.util.mxCellRenderer; +import com.mxgraph.util.mxConstants; +import com.mxgraph.util.png.mxPngEncodeParam; +import com.mxgraph.util.png.mxPngImageEncoder; +import com.mxgraph.view.mxGraph; +import com.mxgraph.view.mxStylesheet; +import org.aksw.jena_sparql_api.core.QueryExecutionFactory; +import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; +import org.apache.jena.graph.Node; +import org.apache.jena.graph.Triple; +import org.apache.jena.query.*; +import org.apache.jena.rdf.model.Model; +import org.dllearner.kb.sparql.CBDStructureTree; +import org.dllearner.kb.sparql.TreeBasedConciseBoundedDescriptionGenerator; +import org.dllearner.utilities.ProgressBar; +import org.dllearner.utilities.QueryUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.awt.*; +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.*; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +/** + * @author Lorenz Buehmann + * + */ +public abstract class BenchmarkDescriptionGenerator { + + private static final Logger LOGGER = LoggerFactory.getLogger(BenchmarkDescriptionGenerator.class); + + private QueryExecutionFactory qef; + private TreeBasedConciseBoundedDescriptionGenerator cbdGen; + private QueryUtils utils = new QueryUtils(); + + private boolean useConstruct = true; + + protected Set skipQueryTokens = new HashSet<>(); + + protected StringBuilder sb; + + public BenchmarkDescriptionGenerator(QueryExecutionFactory qef) { + this.qef = qef; + cbdGen = new TreeBasedConciseBoundedDescriptionGenerator(qef); + } + + protected abstract void beginDocument(StringBuilder sb); + protected abstract void endDocument(StringBuilder sb); + protected abstract void beginTable(StringBuilder sb); + protected abstract void addRow(StringBuilder sb, QueryData queryData); + protected abstract void endTable(StringBuilder sb); + + public void generateBenchmarkDescription(File benchmarkQueriesFile, File htmlOutputFile, boolean withQueryIdGivenInFile) throws Exception{ + Map id2Query = new HashMap<>(); + int id = 1; + for (String line : Files.readLines(benchmarkQueriesFile, Charsets.UTF_8)) { + String queryString = line; + String idString = String.valueOf(id); + if(withQueryIdGivenInFile) { + idString = queryString.substring(0, queryString.indexOf(",")); + queryString = queryString.substring(queryString.indexOf(",") + 1); + } + Query query = QueryFactory.create(queryString); + id2Query.put(idString, query); + } + generateBenchmarkDescription(id2Query, htmlOutputFile); + } + + public void generateBenchmarkDescription(EvaluationDataset dataset, File htmlOutputFile) throws Exception{ + generateBenchmarkDescription(dataset.sparqlQueries, htmlOutputFile); + } + + public void generateBenchmarkDescription(Map id2Query, File htmlOutputFile) throws Exception{ + StringBuilder sb = new StringBuilder(); + beginDocument(sb); + beginTable(sb); + + File graphDir = new File("/tmp/graphs/"); + graphDir.mkdirs(); + for (Map.Entry entry : id2Query.entrySet()) { + String id = entry.getKey(); + Query query = 
entry.getValue(); + if (skipQueryTokens.stream().anyMatch(t -> query.toString().contains(t))){ + continue; + } + + System.out.println(query); + +// exportGraph(query, new File("/tmp/graphs/graph" + id + ".png")); + File graphFile = new File(graphDir, "graph" + id + ".png"); + QueryToGraphExporter.exportYedGraph(query, graphFile, true); + + // column: SPARQL query type + SPARQLUtils.QueryType queryType = SPARQLUtils.getQueryType(query); + + // query graph +// row += "\"query\n"; + + // column: depth + int maxDepth = getLongestPath(query); + + List result = SPARQLUtils.getResult(qef, query); + + // column: #instances + int nrOfInstances = result.size(); + + // columns: optimal CBD sizes (min, max, avg) + DescriptiveStatistics optimalCBDSizeStats = determineOptimalCBDSizes(query, result); + + // columns: generic CBD sizes (min, max, avg) + DescriptiveStatistics genericCBDSizeStats = determineDefaultCBDSizes(query, result); + + addRow(sb, new QueryData(id, query, queryType, maxDepth, nrOfInstances, optimalCBDSizeStats, genericCBDSizeStats)); + } + + endTable(sb); + endDocument(sb); + + try { + Files.write(sb, htmlOutputFile, Charsets.UTF_8); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private int getLongestPath(Query query) { + SPARQLUtils.QueryType type = SPARQLUtils.getQueryType(query); + + int length = 0; + if(type == SPARQLUtils.QueryType.IN) { + Set tmp = utils.extractIncomingTriplePatterns(query, query.getProjectVars().get(0)); + while(!tmp.isEmpty()) { + length++; + tmp = tmp.stream() + .filter(tp -> tp.getSubject().isVariable()) + .map(tp -> tp.getSubject()) + .map(s -> utils.extractIncomingTriplePatterns(query, s)) + .flatMap(tps -> tps.stream()) + .collect(Collectors.toSet()); + } + } else if(type == SPARQLUtils.QueryType.OUT) { + Set tmp = utils.extractOutgoingTriplePatterns(query, query.getProjectVars().get(0)); + while(!tmp.isEmpty()) { + length++; + tmp = tmp.stream() + .filter(tp -> tp.getObject().isVariable()) + .map(tp -> tp.getObject()) + .map(o -> utils.extractOutgoingTriplePatterns(query, o)) + .flatMap(tps -> tps.stream()) + .collect(Collectors.toSet()); + } + } else { + length = -1; + } + return length; + } + + private CBDStructureTree getDefaultCBDStructureTree() { + CBDStructureTree defaultCbdStructure = new CBDStructureTree(); + defaultCbdStructure.addOutNode().addOutNode(); + CBDStructureTree inNode = defaultCbdStructure.addInNode(); + inNode.addOutNode(); + inNode.addInNode(); + return defaultCbdStructure; + } + + private DescriptiveStatistics determineDefaultCBDSizes(Query query, List resources) { + DescriptiveStatistics stats = new DescriptiveStatistics(); + NumberFormat df = DecimalFormat.getPercentInstance(); + AtomicInteger idx = new AtomicInteger(1); + + CBDStructureTree cbdStructure = getDefaultCBDStructureTree(); + System.out.println(cbdStructure.toStringVerbose()); + + ProgressBar progressBar = new ProgressBar(); + + resources.forEach(r -> { + long cnt = -1; + if(useConstruct) { + Model cbd = null; + try { +// cbd = cbdGen.getConciseBoundedDescription(r, cbdStructure); +// cnt = cbd.size(); +// System.out.println(r + ":" + cnt); + } catch (Exception e) { + LOGGER.error(e.getMessage(), e.getCause()); + } + + } else { + ParameterizedSparqlString template = SPARQLUtils.CBD_TEMPLATE_DEPTH3.copy(); + template.setIri("uri", r); + try(QueryExecution qe = qef.createQueryExecution(template.toString())) { + ResultSet rs = qe.execSelect(); + cnt = rs.next().getLiteral("cnt").getInt(); + } catch (Exception e) { + LOGGER.error(e.getMessage(), 
e.getCause()); + } + } + stats.addValue(cnt); + progressBar.update(idx.getAndAdd(1), resources.size()); + + }); + + return stats; + } + + private DescriptiveStatistics determineOptimalCBDSizes(Query query, List resources) { + DescriptiveStatistics stats = new DescriptiveStatistics(); + NumberFormat df = DecimalFormat.getPercentInstance(); + AtomicInteger idx = new AtomicInteger(1); + + CBDStructureTree cbdStructure = QueryUtils.getOptimalCBDStructure(query); + System.out.println(cbdStructure.toStringVerbose()); + + ProgressBar progressBar = new ProgressBar(); + + resources.forEach(r -> { + long cnt = -1; + if(useConstruct) { + Model cbd = null; + try { + cbd = cbdGen.getConciseBoundedDescription(r, cbdStructure); + cnt = cbd.size(); + } catch (Exception e) { + LOGGER.error(e.getMessage(), e.getCause()); + } + + } else { + ParameterizedSparqlString template = SPARQLUtils.CBD_TEMPLATE_DEPTH3.copy(); + template.setIri("uri", r); + try(QueryExecution qe = qef.createQueryExecution(template.toString())) { + ResultSet rs = qe.execSelect(); + cnt = rs.next().getLiteral("cnt").getInt(); + } catch (Exception e) { + LOGGER.error(e.getMessage(), e.getCause()); + } + } + stats.addValue(cnt); + progressBar.update(idx.getAndAdd(1), resources.size()); + + }); + + return stats; + } + + private void exportGraph(Query query, File file) { + mxGraph graph = new mxGraph(); + Object parent = graph.getDefaultParent(); + // Adds cells to the model in a single step + graph.getModel().beginUpdate(); + try + { + Set tps = utils.extractTriplePattern(query); + + Map mapping = new HashMap<>(); + tps.forEach(tp -> { + Object val1 = mapping.putIfAbsent(tp.getSubject(), graph.insertVertex(parent, null, tp.getSubject().toString(query.getPrefixMapping()), 20, 20, 40, 30)); + Object val2 = mapping.putIfAbsent(tp.getObject(), graph.insertVertex(parent, null, tp.getObject().toString(query.getPrefixMapping()), 20, 20, 40, 30)); + }); + tps.forEach(tp -> { + graph.insertEdge(parent, null, tp.getPredicate().toString(query.getPrefixMapping()), mapping.get(tp.getSubject()), mapping.get(tp.getObject())); + }); + + } + finally + { + // Updates the display + graph.getModel().endUpdate(); + } + mxGraphComponent graphComponent = new mxGraphComponent(graph); + + // positioning via jgraphx layouts +// mxHierarchicalLayout layout = new mxHierarchicalLayout(graph); +// layout.setParallelEdgeSpacing(20d); +// layout.setIntraCellSpacing(40d); + mxGraphLayout layout = new mxOrthogonalLayout(graph); + layout.execute(graph.getDefaultParent()); + + Map edgeStyle = new HashMap(); +//edgeStyle.put(mxConstants.STYLE_EDGE, mxConstants.EDGESTYLE_ORTHOGONAL); + edgeStyle.put(mxConstants.STYLE_SHAPE, mxConstants.SHAPE_CONNECTOR); + edgeStyle.put(mxConstants.STYLE_ENDARROW, mxConstants.ARROW_CLASSIC); + edgeStyle.put(mxConstants.STYLE_STROKECOLOR, "#000000"); + edgeStyle.put(mxConstants.STYLE_FONTCOLOR, "#000000"); + edgeStyle.put(mxConstants.STYLE_LABEL_BACKGROUNDCOLOR, "#ffffff"); + + Map nodeStyle = new HashMap<>(); + nodeStyle.put(mxConstants.STYLE_SHAPE, mxConstants.SHAPE_ELLIPSE); + nodeStyle.put(mxConstants.STYLE_VERTICAL_ALIGN, mxConstants.ALIGN_BOTTOM); + + mxStylesheet stylesheet = new mxStylesheet(); + stylesheet.setDefaultEdgeStyle(edgeStyle); + stylesheet.setDefaultVertexStyle(nodeStyle); + + graph.setStylesheet(stylesheet); + +// JFrame frame = new JFrame(); +// frame.getContentPane().add(new mxGraphComponent(adapter)); +// frame.pack(); +// frame.setDefaultCloseOperation(WindowConstants.DISPOSE_ON_CLOSE); +// frame.setVisible(true); + + + + 
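+		// render the laid-out graph into an off-screen image (scale 1, white background,
+		// anti-aliasing enabled) and encode it as a PNG file below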
BufferedImage image = mxCellRenderer.createBufferedImage(graph, null, 1, Color.WHITE, true, null); + mxPngEncodeParam param = mxPngEncodeParam.getDefaultEncodeParam(image); + + + try { + FileOutputStream outputStream = new FileOutputStream(file); + mxPngImageEncoder encoder = new mxPngImageEncoder(outputStream, param); + if (image != null) { + encoder.encode(image); + } + outputStream.close(); +// ImageIO.write(image, "PNG", file); + } catch (IOException e) { + e.printStackTrace(); + } + } + + class QueryData { + final String id; + final Query query; + final SPARQLUtils.QueryType queryType; + final int maxTreeDepth; + final int nrOfInstances; + final DescriptiveStatistics optimalCBDSizeStats; + final DescriptiveStatistics determineDefaultCBDSizes; + + public QueryData(String id, Query query, SPARQLUtils.QueryType queryType, int maxTreeDepth, int nrOfInstances, + DescriptiveStatistics optimalCBDSizeStats, DescriptiveStatistics determineDefaultCBDSizes) { + this.id = id; + this.query = query; + this.queryType = queryType; + this.maxTreeDepth = maxTreeDepth; + this.nrOfInstances = nrOfInstances; + this.optimalCBDSizeStats = optimalCBDSizeStats; + this.determineDefaultCBDSizes = determineDefaultCBDSizes; + } + + + } + + +} + + diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/BenchmarkDescriptionGeneratorHTML.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/BenchmarkDescriptionGeneratorHTML.java index 7f10f9363d..4c48589ff6 100644 --- a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/BenchmarkDescriptionGeneratorHTML.java +++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/BenchmarkDescriptionGeneratorHTML.java @@ -18,104 +18,26 @@ */ package org.dllearner.algorithms.qtl.experiments; -import java.awt.*; -import java.awt.image.BufferedImage; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.text.DecimalFormat; -import java.text.NumberFormat; -import java.util.*; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; - -import com.google.common.collect.Lists; -import com.mxgraph.layout.mxGraphLayout; -import com.mxgraph.layout.orthogonal.mxOrthogonalLayout; -import com.mxgraph.swing.mxGraphComponent; -import com.mxgraph.util.mxCellRenderer; -import com.mxgraph.util.mxConstants; -import com.mxgraph.util.png.mxPngEncodeParam; -import com.mxgraph.util.png.mxPngImageEncoder; -import com.mxgraph.view.mxGraph; -import com.mxgraph.view.mxStylesheet; -import org.aksw.jena_sparql_api.core.FluentQueryExecutionFactory; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; import org.aksw.jena_sparql_api.core.QueryExecutionFactory; - -import com.google.common.base.Charsets; -import com.google.common.io.Files; -import org.aksw.jena_sparql_api.http.QueryExecutionHttpWrapper; import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; -import org.apache.jena.graph.Node; -import org.apache.jena.graph.Triple; -import org.apache.jena.query.*; -import org.apache.jena.rdf.model.Model; -import org.apache.jena.riot.WebContent; -import org.apache.jena.sparql.core.Var; -import org.apache.jena.sparql.engine.http.QueryEngineHTTP; import org.dllearner.kb.SparqlEndpointKS; -import org.dllearner.kb.sparql.CBDStructureTree; import org.dllearner.kb.sparql.SparqlEndpoint; -import org.dllearner.kb.sparql.TreeBasedConciseBoundedDescriptionGenerator; -import 
org.dllearner.utilities.ProgressBar; -import org.dllearner.utilities.QueryUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.File; +import java.net.URL; + /** * @author Lorenz Buehmann * */ -public class BenchmarkDescriptionGeneratorHTML { - - private static final Logger LOGGER = LoggerFactory.getLogger(BenchmarkDescriptionGeneratorHTML.class); - - String style = - "\n" + - "\n" - + "" + - "\n" + - ""; +public class BenchmarkDescriptionGeneratorHTML extends BenchmarkDescriptionGenerator{ - String style2 = + String style = "\n" + "\n" + "\n" + @@ -169,40 +91,34 @@ public class BenchmarkDescriptionGeneratorHTML { "\n" + "\n"; - - private QueryExecutionFactory qef; - private TreeBasedConciseBoundedDescriptionGenerator cbdGen; - private QueryUtils utils = new QueryUtils(); + public BenchmarkDescriptionGeneratorHTML(QueryExecutionFactory qef) { + super(qef); + System.out.println(style); + skipQueryTokens.add("1154"); - private boolean useConstruct = true; + } + public void generateBenchmarkDescription(EvaluationDataset dataset, File htmlOutputFile) throws Exception{ + generateBenchmarkDescription(dataset.sparqlQueries, htmlOutputFile); + } - public BenchmarkDescriptionGeneratorHTML(QueryExecutionFactory qef) { - this.qef = qef; - cbdGen = new TreeBasedConciseBoundedDescriptionGenerator(qef); + @Override + protected void beginDocument(StringBuilder sb) { + sb.append("\n"); + sb.append(style); + sb.append("\n"); } - private List loadQueries(File queriesFile) throws IOException { - List queries = new ArrayList<>(); - - for (String queryString : Files.readLines(queriesFile, Charsets.UTF_8)) { - Query q = QueryFactory.create(queryString); - adjustPrefixes(q); - queries.add(q); - } - return queries; + @Override + protected void endDocument(StringBuilder sb) { + sb.append("\n\n"); } - - public void generateBenchmarkDescription(File benchmarkQueriesFile, File htmlOutputFile) throws Exception{ - List queries = loadQueries(benchmarkQueriesFile); - - Var var = Var.alloc("s"); - String html = "\n"; - html += style2; - html += "\n"; - html += "\n"; + + @Override + protected void beginTable(StringBuilder sb) { + sb.append("
\n"); // table header - html += "\n" + sb.append("\n" // "\n" // + "\n" // + "\n" @@ -212,7 +128,7 @@ public void generateBenchmarkDescription(File benchmarkQueriesFile, File htmlOut + "\n" + "\n" + "\n" - + "\n" +// + "\n" + "\n" + "\n" + "\n" @@ -222,309 +138,82 @@ public void generateBenchmarkDescription(File benchmarkQueriesFile, File htmlOut + "\n" + "\n" + "\n" + - "\n"; - - html += "\n"; - int id = 1; - File graphDir = new File("/tmp/graphs/"); - graphDir.mkdirs(); - for (Query query : queries) { -// if(!query.toString().contains("Sopranos"))continue; -// if(id == 3) break; - System.out.println(query); - -// exportGraph(query, new File("/tmp/graphs/graph" + id + ".png")); - File graphFile = new File(graphDir, "graph" + id + ".png"); -// QueryToGraphExporter.exportYedGraph(query, graphFile, true); - - html += "\n"; - - // column: ID - html += "\n"; - - // column: SPARQL query - html += "\n"; - - // column: SPARQL query type - html += "\n"; - - // query graph - html += "\n"; - - // column: depth - html += "\n"; + "\n"); - List result = SPARQLUtils.getResult(qef, query); - - // column: #instances - int nrOfInstances = result.size(); - html += "\n"; - - // columns: optimal CBD sizes (min, max, avg) - DescriptiveStatistics optimalCBDSizeStats = determineOptimalCBDSizes(query, result); - html += "\n"; - html += "\n"; - html += "\n"; - - // columns: generic CBD sizes (min, max, avg) - DescriptiveStatistics genericCBDSizeStats = determineDefaultCBDSizes(query, result); - html += "\n"; - html += "\n"; - html += "\n"; - - - html += "\n"; -// break; - } - html += "\n"; - html += "
test|CBD|optIDQueryQuery TypeQuery GraphQuery GraphDepth#Instances|CBD|min|CBD|max|CBD|avg
" + id++ + "
" + query.toString().replace("<", "<").replace(">", ">") + "
" + SPARQLUtils.getQueryType(query) + "\"query" + getLongestPath(query) + "" + nrOfInstances + "" + (int)optimalCBDSizeStats.getMin() + "" + (int)optimalCBDSizeStats.getMax() + "" + (int)optimalCBDSizeStats.getMean() + "" + (int)genericCBDSizeStats.getMin() + "" + (int)genericCBDSizeStats.getMax() + "" + (int)genericCBDSizeStats.getMean() + "
\n"; - html += "\n"; - html += "\n"; - - try { - Files.write(html, htmlOutputFile, Charsets.UTF_8); - } catch (IOException e) { - e.printStackTrace(); - } + sb.append("\n"); } - private void adjustPrefixes(Query query) { - query.getPrefixMapping().removeNsPrefix("owl"); - query.getPrefixMapping().removeNsPrefix("rdfs"); - query.getPrefixMapping().removeNsPrefix("foaf"); - query.getPrefixMapping().removeNsPrefix("rdf"); + @Override + protected void addRow(StringBuilder sb, QueryData queryData) { + sb.append("\n"); - if(query.toString().contains("http://dbpedia.org/ontology/")) { - query.getPrefixMapping().setNsPrefix("dbo", "http://dbpedia.org/ontology/"); - } - if(query.toString().contains("http://dbpedia.org/property/")) { - query.getPrefixMapping().setNsPrefix("dbp", "http://dbpedia.org/property/"); - } - if(query.toString().contains("http://xmlns.com/foaf/0.1/")) { - query.getPrefixMapping().setNsPrefix("foaf", "http://xmlns.com/foaf/0.1/"); - } - if(query.toString().contains("http://www.w3.org/1999/02/22-rdf-syntax-ns#") || query.toString().contains(" a ")) { - query.getPrefixMapping().setNsPrefix("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"); - } - if(query.toString().contains("http://dbpedia.org/resource/")) { - query.getPrefixMapping().setNsPrefix("", "http://dbpedia.org/resource/"); - } - } + // column: ID + sb.append("" + queryData.id + "\n"); - private int getLongestPath(Query query) { - SPARQLUtils.QueryType type = SPARQLUtils.getQueryType(query); + // column: SPARQL query + sb.append("
" + queryData.query.toString().replace("<", "<").replace(">", ">") + "
\n"); - int length = 0; - if(type == SPARQLUtils.QueryType.IN) { - Set tmp = utils.extractIncomingTriplePatterns(query, query.getProjectVars().get(0)); - while(!tmp.isEmpty()) { - length++; - tmp = tmp.stream() - .filter(tp -> tp.getSubject().isVariable()) - .map(tp -> tp.getSubject()) - .map(s -> utils.extractIncomingTriplePatterns(query, s)) - .flatMap(tps -> tps.stream()) - .collect(Collectors.toSet()); - } - } else if(type == SPARQLUtils.QueryType.OUT) { - Set tmp = utils.extractOutgoingTriplePatterns(query, query.getProjectVars().get(0)); - while(!tmp.isEmpty()) { - length++; - tmp = tmp.stream() - .filter(tp -> tp.getObject().isVariable()) - .map(tp -> tp.getObject()) - .map(o -> utils.extractOutgoingTriplePatterns(query, o)) - .flatMap(tps -> tps.stream()) - .collect(Collectors.toSet()); - } - } else { - length = -1; - } - return length; - } + // column: SPARQL query type + sb.append("" + queryData.queryType + "\n"); - private CBDStructureTree getDefaultCBDStructureTree() { - CBDStructureTree defaultCbdStructure = new CBDStructureTree(); - defaultCbdStructure.addOutNode().addOutNode(); - CBDStructureTree inNode = defaultCbdStructure.addInNode(); - inNode.addOutNode(); - inNode.addInNode(); - return defaultCbdStructure; - } + // query graph +// QueryToGraphExporter.exportYedGraph(queryData.query, new File("")); +// sb.append("\"query\n"); - private DescriptiveStatistics determineDefaultCBDSizes(Query query, List resources) { - DescriptiveStatistics stats = new DescriptiveStatistics(); - NumberFormat df = DecimalFormat.getPercentInstance(); - AtomicInteger idx = new AtomicInteger(1); + // column: depth + sb.append("" + queryData.maxTreeDepth + "\n"); - CBDStructureTree cbdStructure = getDefaultCBDStructureTree(); - System.out.println(cbdStructure.toStringVerbose()); + // column: #instances + sb.append("" + queryData.nrOfInstances + "\n"); - ProgressBar progressBar = new ProgressBar(); + // columns: optimal CBD sizes (min, max, avg) + DescriptiveStatistics optimalCBDSizeStats = queryData.optimalCBDSizeStats; + sb.append("" + (int)optimalCBDSizeStats.getMin() + "\n"); + sb.append("" + (int)optimalCBDSizeStats.getMax() + "\n"); + sb.append("" + (int)optimalCBDSizeStats.getMean() + "\n"); - resources.forEach(r -> { - long cnt = -1; - if(useConstruct) { - Model cbd = null; - try { - cbd = cbdGen.getConciseBoundedDescription(r, cbdStructure); - cnt = cbd.size(); - System.out.println(r + ":" + cnt); - } catch (Exception e) { - LOGGER.error(e.getMessage(), e.getCause()); - } + // columns: generic CBD sizes (min, max, avg) + DescriptiveStatistics genericCBDSizeStats = queryData.determineDefaultCBDSizes; + sb.append("" + (int)genericCBDSizeStats.getMin() + "\n"); + sb.append("" + (int)genericCBDSizeStats.getMax() + "\n"); + sb.append("" + (int)genericCBDSizeStats.getMean() + "\n"); - } else { - ParameterizedSparqlString template = SPARQLUtils.CBD_TEMPLATE_DEPTH3.copy(); - template.setIri("uri", r); - try(QueryExecution qe = qef.createQueryExecution(template.toString())) { - ResultSet rs = qe.execSelect(); - cnt = rs.next().getLiteral("cnt").getInt(); - } catch (Exception e) { - LOGGER.error(e.getMessage(), e.getCause()); - } - } - stats.addValue(cnt); - progressBar.update(idx.getAndAdd(1), resources.size()); - }); - - return stats; + sb.append("\n"); } - private DescriptiveStatistics determineOptimalCBDSizes(Query query, List resources) { - DescriptiveStatistics stats = new DescriptiveStatistics(); - NumberFormat df = DecimalFormat.getPercentInstance(); - AtomicInteger idx = new 
AtomicInteger(1); - - CBDStructureTree cbdStructure = QueryUtils.getOptimalCBDStructure(query); - System.out.println(cbdStructure.toStringVerbose()); - - ProgressBar progressBar = new ProgressBar(); - - resources.forEach(r -> { - long cnt = -1; - if(useConstruct) { - Model cbd = null; - try { - cbd = cbdGen.getConciseBoundedDescription(r, cbdStructure); - cnt = cbd.size(); - } catch (Exception e) { - LOGGER.error(e.getMessage(), e.getCause()); - } - - } else { - ParameterizedSparqlString template = SPARQLUtils.CBD_TEMPLATE_DEPTH3.copy(); - template.setIri("uri", r); - try(QueryExecution qe = qef.createQueryExecution(template.toString())) { - ResultSet rs = qe.execSelect(); - cnt = rs.next().getLiteral("cnt").getInt(); - } catch (Exception e) { - LOGGER.error(e.getMessage(), e.getCause()); - } - } - stats.addValue(cnt); - progressBar.update(idx.getAndAdd(1), resources.size()); - - }); - - return stats; + @Override + protected void endTable(StringBuilder sb) { + sb.append("\n\n"); } - private void exportGraph(Query query, File file) { - mxGraph graph = new mxGraph(); - Object parent = graph.getDefaultParent(); - // Adds cells to the model in a single step - graph.getModel().beginUpdate(); - try - { - Set tps = utils.extractTriplePattern(query); - - Map mapping = new HashMap<>(); - tps.forEach(tp -> { - Object val1 = mapping.putIfAbsent(tp.getSubject(), graph.insertVertex(parent, null, tp.getSubject().toString(query.getPrefixMapping()), 20, 20, 40, 30)); - Object val2 = mapping.putIfAbsent(tp.getObject(), graph.insertVertex(parent, null, tp.getObject().toString(query.getPrefixMapping()), 20, 20, 40, 30)); - }); - tps.forEach(tp -> { - graph.insertEdge(parent, null, tp.getPredicate().toString(query.getPrefixMapping()), mapping.get(tp.getSubject()), mapping.get(tp.getObject())); - }); - - } - finally - { - // Updates the display - graph.getModel().endUpdate(); - } - mxGraphComponent graphComponent = new mxGraphComponent(graph); - - // positioning via jgraphx layouts -// mxHierarchicalLayout layout = new mxHierarchicalLayout(graph); -// layout.setParallelEdgeSpacing(20d); -// layout.setIntraCellSpacing(40d); - mxGraphLayout layout = new mxOrthogonalLayout(graph); - layout.execute(graph.getDefaultParent()); - - Map edgeStyle = new HashMap(); -//edgeStyle.put(mxConstants.STYLE_EDGE, mxConstants.EDGESTYLE_ORTHOGONAL); - edgeStyle.put(mxConstants.STYLE_SHAPE, mxConstants.SHAPE_CONNECTOR); - edgeStyle.put(mxConstants.STYLE_ENDARROW, mxConstants.ARROW_CLASSIC); - edgeStyle.put(mxConstants.STYLE_STROKECOLOR, "#000000"); - edgeStyle.put(mxConstants.STYLE_FONTCOLOR, "#000000"); - edgeStyle.put(mxConstants.STYLE_LABEL_BACKGROUNDCOLOR, "#ffffff"); - - Map nodeStyle = new HashMap<>(); - nodeStyle.put(mxConstants.STYLE_SHAPE, mxConstants.SHAPE_ELLIPSE); - nodeStyle.put(mxConstants.STYLE_VERTICAL_ALIGN, mxConstants.ALIGN_BOTTOM); - - mxStylesheet stylesheet = new mxStylesheet(); - stylesheet.setDefaultEdgeStyle(edgeStyle); - stylesheet.setDefaultVertexStyle(nodeStyle); - - graph.setStylesheet(stylesheet); - -// JFrame frame = new JFrame(); -// frame.getContentPane().add(new mxGraphComponent(adapter)); -// frame.pack(); -// frame.setDefaultCloseOperation(WindowConstants.DISPOSE_ON_CLOSE); -// frame.setVisible(true); - - + public static void main(String[] args) throws Exception{ + OptionParser parser = new OptionParser(); + OptionSpec benchmarkDirectorySpec = parser.accepts("d", "base directory").withRequiredArg().ofType(File.class).required(); + OptionSpec queriesFileSpec = parser.accepts("i", "input queries 
file").withRequiredArg().ofType(File.class).required(); + OptionSpec outputFileSpec = parser.accepts("o", "target output file").withRequiredArg().ofType(File.class).required(); + OptionSpec endpointURLSpec = parser.accepts("e", "endpoint URL").withRequiredArg().ofType(URL.class).required(); + OptionSpec defaultGraphSpec = parser.accepts("g", "default graph").withRequiredArg().ofType(String.class); + OptionSpec useCacheSpec = parser.accepts("cache", "use cache").withOptionalArg().ofType(Boolean.class).defaultsTo(Boolean.TRUE); + OptionSpec queriesHaveIdSpec = parser.accepts("id", "input file contains ID, SPARQL query").withOptionalArg().ofType(Boolean.class).defaultsTo(Boolean.TRUE); - BufferedImage image = mxCellRenderer.createBufferedImage(graph, null, 1, Color.WHITE, true, null); - mxPngEncodeParam param = mxPngEncodeParam.getDefaultEncodeParam(image); + OptionSet options = parser.parse(args); + File benchmarkDirectory = options.valueOf(benchmarkDirectorySpec); + File inputFile = options.valueOf(queriesFileSpec); + File outputFile = options.valueOf(outputFileSpec); - try { - FileOutputStream outputStream = new FileOutputStream(file); - mxPngImageEncoder encoder = new mxPngImageEncoder(outputStream, param); - if (image != null) { - encoder.encode(image); - } - outputStream.close(); -// ImageIO.write(image, "PNG", file); - } catch (IOException e) { - e.printStackTrace(); - } - } + URL endpointURL = options.valueOf(endpointURLSpec); + String defaultGraph = options.has(defaultGraphSpec) ? options.valueOf(defaultGraphSpec) : null; + SparqlEndpoint endpoint = SparqlEndpoint.create(endpointURL.toString(), defaultGraph); - public static void main(String[] args) throws Exception{ - if(args.length < 3) { - System.out.println("Usage: BenchmarkDescriptionGeneratorHTML "); - System.exit(0); - } - File source = new File(args[0]); - File target = new File(args[1]); - String endpointURL = args[2]; - String defaultGraph = null; - if(args.length == 4) - defaultGraph = args[3]; - - SparqlEndpoint endpoint = SparqlEndpoint.create(endpointURL, defaultGraph == null ? 
Collections.EMPTY_LIST : Lists.newArrayList(defaultGraph)); SparqlEndpointKS ks = new SparqlEndpointKS(endpoint); - ks.setCacheDir("/tmp/qtl-eval"); + ks.setUseCache(options.valueOf(useCacheSpec)); + ks.setCacheDir(benchmarkDirectory.getPath()); ks.init(); + BenchmarkDescriptionGeneratorHTML generator = new BenchmarkDescriptionGeneratorHTML(ks.getQueryExecutionFactory()); - generator.generateBenchmarkDescription(source, target); + generator.generateBenchmarkDescription(inputFile, outputFile, options.valueOf(queriesHaveIdSpec)); } } diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/DBpediaEvaluationDataset.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/DBpediaEvaluationDataset.java index 97c8ff58dc..5a79ab94fc 100644 --- a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/DBpediaEvaluationDataset.java +++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/DBpediaEvaluationDataset.java @@ -22,6 +22,7 @@ import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.google.common.io.Files; +import org.apache.jena.query.QueryFactory; import org.apache.jena.rdf.model.Statement; import org.apache.jena.shared.PrefixMapping; import org.apache.jena.sparql.vocabulary.FOAF; @@ -29,15 +30,14 @@ import org.dllearner.algorithms.qtl.util.StopURIsOWL; import org.dllearner.algorithms.qtl.util.StopURIsRDFS; import org.dllearner.algorithms.qtl.util.StopURIsSKOS; -import org.dllearner.algorithms.qtl.util.filters.NamespaceDropStatementFilter; -import org.dllearner.algorithms.qtl.util.filters.ObjectDropStatementFilter; -import org.dllearner.algorithms.qtl.util.filters.PredicateDropStatementFilter; +import org.dllearner.algorithms.qtl.util.filters.*; import org.dllearner.core.ComponentInitException; import org.dllearner.kb.SparqlEndpointKS; import org.dllearner.kb.sparql.SparqlEndpoint; import org.dllearner.reasoning.SPARQLReasoner; import java.io.File; +import java.util.HashMap; import java.util.List; import java.util.function.Predicate; @@ -61,7 +61,12 @@ public DBpediaEvaluationDataset(File benchmarkDirectory, SparqlEndpoint endpoint // load SPARQL queries try { - sparqlQueries = Files.readLines(queriesFile, Charsets.UTF_8); + sparqlQueries = new HashMap<>(); + int i = 1; + List lines = Files.readLines(queriesFile, Charsets.UTF_8); + for (String line : lines) { + sparqlQueries.put(String.valueOf(i++), QueryFactory.create(line)); + } } catch (Exception e) { e.printStackTrace(); } @@ -79,6 +84,9 @@ public DBpediaEvaluationDataset(File benchmarkDirectory, SparqlEndpoint endpoint prefixMapping.setNsPrefix("wiki", "http://wikidata.dbpedia.org/resource/"); prefixMapping.setNsPrefix("odp-dul", "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#"); prefixMapping.setNsPrefix("schema", "http://schema.org/"); + + PredicateExistenceFilter predicateFilter = new PredicateExistenceFilterDBpedia(null); + setPredicateFilter(predicateFilter); } @Override diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/EvaluationDataset.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/EvaluationDataset.java index 58adfbf2e3..974afb0158 100644 --- a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/EvaluationDataset.java +++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/EvaluationDataset.java @@ -18,30 +18,25 @@ */ package org.dllearner.algorithms.qtl.experiments; -import 
com.google.common.base.Joiner; import com.jamonapi.Monitor; import com.jamonapi.MonitorFactory; import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; -import org.apache.jena.graph.Triple; -import org.apache.jena.query.ParameterizedSparqlString; -import org.apache.jena.query.QueryFactory; -import org.apache.jena.query.ResultSet; +import org.apache.jena.query.Query; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.Statement; import org.apache.jena.shared.PrefixMapping; +import org.dllearner.algorithms.qtl.util.filters.PredicateExistenceFilter; import org.dllearner.core.AbstractReasonerComponent; import org.dllearner.kb.SparqlEndpointKS; import org.dllearner.kb.sparql.ConciseBoundedDescriptionGenerator; import org.dllearner.kb.sparql.SymmetricConciseBoundedDescriptionGeneratorImpl; -import org.dllearner.utilities.QueryUtils; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.util.ArrayList; -import java.util.Collections; import java.util.List; -import java.util.Set; +import java.util.Map; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -52,8 +47,6 @@ */ public abstract class EvaluationDataset { - - String name; SparqlEndpointKS ks; @@ -62,9 +55,11 @@ public abstract class EvaluationDataset { AbstractReasonerComponent reasoner; - List sparqlQueries; + Map sparqlQueries; List> queryTreeFilters = new ArrayList<>(); + private PredicateExistenceFilter predicateFilter; + public EvaluationDataset(String name) { this.name = name; } @@ -89,7 +84,7 @@ public PrefixMapping getPrefixMapping() { return prefixMapping; } - public List getSparqlQueries() { + public Map getSparqlQueries() { return sparqlQueries; } @@ -97,29 +92,78 @@ public List> getQueryTreeFilters() { return queryTreeFilters; } + public PredicateExistenceFilter getPredicateFilter() { + return predicateFilter; + } + + public void setPredicateFilter(PredicateExistenceFilter predicateFilter) { + this.predicateFilter = predicateFilter; + } + /** - * Writes the SPARQL queries line-wise to file. + * Writes the ID and SPARQL queries line-wise to file. 
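+	 * Each line has the format "<id>, <SPARQL query on a single line>".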
* * @param file the file */ public void saveToDisk(File file) throws IOException { - Files.write(file.toPath(), sparqlQueries.stream().map(q -> q.replace("\n", " ")).collect(Collectors.toList())); + sparqlQueries.entrySet().stream().forEach(entry -> adjustPrefixes(entry.getValue())); + Files.write(file.toPath(), + sparqlQueries.entrySet().stream() + .map(entry -> entry.getKey() + ", " + entry.getValue().toString().replace("\n", " ")) + .collect(Collectors.toList())); + } + + protected void adjustPrefixes(Query query) { + query.getPrefixMapping().removeNsPrefix("owl"); + query.getPrefixMapping().removeNsPrefix("rdfs"); + query.getPrefixMapping().removeNsPrefix("foaf"); + query.getPrefixMapping().removeNsPrefix("rdf"); + + prefixMapping.getNsPrefixMap().entrySet().forEach(entry -> { + if(query.toString().contains(entry.getValue())) { + query.getPrefixMapping().setNsPrefix(entry.getKey(), entry.getValue()); + } + }); + +// if(query.toString().contains("http://dbpedia.org/ontology/")) { +// query.getPrefixMapping().setNsPrefix("dbo", "http://dbpedia.org/ontology/"); +// } +// if(query.toString().contains("http://dbpedia.org/property/")) { +// query.getPrefixMapping().setNsPrefix("dbp", "http://dbpedia.org/property/"); +// } +// if(query.toString().contains("http://xmlns.com/foaf/0.1/")) { +// query.getPrefixMapping().setNsPrefix("foaf", "http://xmlns.com/foaf/0.1/"); +// } +// if(query.toString().contains("http://www.w3.org/1999/02/22-rdf-syntax-ns#") || query.toString().contains(" a ")) { +// query.getPrefixMapping().setNsPrefix("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"); +// } +// if(query.toString().contains("http://dbpedia.org/resource/")) { +// query.getPrefixMapping().setNsPrefix("", "http://dbpedia.org/resource/"); +// } } public void analyze() { ConciseBoundedDescriptionGenerator cbdGen = new SymmetricConciseBoundedDescriptionGeneratorImpl(ks.getQueryExecutionFactory()); - String tsv = sparqlQueries.stream().map(QueryFactory::create).map(q -> { + String separator = "\t"; + String tsv = sparqlQueries.entrySet().stream().map(entry -> { StringBuilder sb = new StringBuilder(); + + // ID + String id = entry.getKey(); + sb.append(id).append(separator); + + // query + Query q = entry.getValue(); sb.append(q.toString().replace("\n", " ")); try { // get query result List result = SPARQLUtils.getResult(ks.getQueryExecutionFactory(), q); - sb.append("\t").append(result.size()); + sb.append(separator).append(result.size()); // query type SPARQLUtils.QueryType queryType = SPARQLUtils.getQueryType(q); - sb.append("\t").append(queryType.name()); + sb.append(separator).append(queryType.name()); // check CBD sizes and time Monitor mon = MonitorFactory.getTimeMonitor("CBD"); @@ -137,15 +181,15 @@ public void analyze() { .forEach(sizeStats::addValue); // show min., max. and avg. size - sb.append("\t").append(sizeStats.getMin()); - sb.append("\t").append(sizeStats.getMax()); - sb.append("\t").append(sizeStats.getMean()); + sb.append(separator).append(sizeStats.getMin()); + sb.append(separator).append(sizeStats.getMax()); + sb.append(separator).append(sizeStats.getMean()); // show min., max. and avg. 
CBD time - sb.append("\t").append(mon.getTotal()); - sb.append("\t").append(mon.getMin()); - sb.append("\t").append(mon.getMax()); - sb.append("\t").append(mon.getAvg()); + sb.append(separator).append(mon.getTotal()); + sb.append(separator).append(mon.getMin()); + sb.append(separator).append(mon.getMax()); + sb.append(separator).append(mon.getAvg()); } catch (Exception e) { e.printStackTrace(); } diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/LUBMEvaluationDataset.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/LUBMEvaluationDataset.java index 9945c4ffde..a0bbc3e2e3 100644 --- a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/LUBMEvaluationDataset.java +++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/LUBMEvaluationDataset.java @@ -24,6 +24,7 @@ import org.aksw.jena_sparql_api.core.FluentQueryExecutionFactory; import org.aksw.jena_sparql_api.core.QueryExecutionFactory; import org.aksw.jena_sparql_api.http.QueryExecutionHttpWrapper; +import org.apache.jena.query.Query; import org.apache.jena.query.QueryFactory; import org.apache.jena.rdf.model.Statement; import org.apache.jena.riot.WebContent; @@ -44,8 +45,9 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; -import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.function.Predicate; @@ -80,17 +82,22 @@ public LUBMEvaluationDataset(File benchmarkDirectory, SparqlEndpoint endpoint) { } // read SPARQL queries - sparqlQueries = new ArrayList<>(); + sparqlQueries = new HashMap<>(); try { List lines = Files.readAllLines(Paths.get(QUERIES_FILE)); String query = ""; + String id = null; for (String line : lines) { if(line.startsWith("#")) { query = ""; + if(id == null) { + id = line.replace("#", "").trim(); + } } else if(line.isEmpty()) { if(!query.isEmpty()) { - sparqlQueries.add(query); + sparqlQueries.put(id, QueryFactory.create(query)); + id = null; query = ""; } } else { @@ -131,10 +138,10 @@ public static void main(String[] args) throws Exception{ SparqlEndpoint endpoint = SparqlEndpoint.create("http://sake.informatik.uni-leipzig.de:8890/sparql", "http://lubm.org"); LUBMEvaluationDataset ds = new LUBMEvaluationDataset(new File("/tmp/test"), endpoint); - List queries = ds.getSparqlQueries(); + Map queries = ds.getSparqlQueries(); System.out.println(queries.size()); - queries.forEach(q -> System.out.println(QueryFactory.create(q))); - queries.forEach(q -> System.out.println(ds.getKS().getQueryExecutionFactory().createQueryExecution(q).execSelect().hasNext())); + queries.entrySet().forEach(entry -> System.out.println(entry.getValue())); + queries.entrySet().forEach(entry -> System.out.println(ds.getKS().getQueryExecutionFactory().createQueryExecution(entry.getValue()).execSelect().hasNext())); } diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/PRConvergenceExperiment.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/PRConvergenceExperiment.java index 1b72f341a5..e61af26167 100644 --- a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/PRConvergenceExperiment.java +++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/PRConvergenceExperiment.java @@ -21,6 +21,7 @@ import com.google.common.base.Charsets; import com.google.common.base.Function; import com.google.common.base.Joiner; +import 
com.google.common.base.Splitter; import com.google.common.collect.*; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; @@ -60,6 +61,7 @@ import org.apache.log4j.Logger; import org.apache.log4j.SimpleLayout; import org.dllearner.algorithms.qtl.QTL2Disjunctive; +import org.dllearner.algorithms.qtl.QTL2DisjunctiveMultiThreaded; import org.dllearner.algorithms.qtl.QueryTreeUtils; import org.dllearner.algorithms.qtl.datastructures.impl.EvaluatedRDFResourceTree; import org.dllearner.algorithms.qtl.datastructures.impl.RDFResourceTree; @@ -70,7 +72,6 @@ import org.dllearner.algorithms.qtl.operations.lgg.LGGGeneratorSimple; import org.dllearner.algorithms.qtl.util.Entailment; import org.dllearner.algorithms.qtl.util.filters.PredicateExistenceFilter; -import org.dllearner.algorithms.qtl.util.filters.PredicateExistenceFilterDBpedia; import org.dllearner.algorithms.qtl.util.statistics.TimeMonitors; import org.dllearner.core.ComponentAnn; import org.dllearner.core.ComponentInitException; @@ -147,7 +148,7 @@ enum Baseline { private boolean splitComplexQueries = true; - private PredicateExistenceFilter filter = new PredicateExistenceFilterDBpedia(null); + private PredicateExistenceFilter filter; // the directory where all files, results etc. are maintained private File benchmarkDirectory; @@ -217,10 +218,10 @@ enum Baseline { private long timeStamp; - Set tokens = Sets.newHashSet( + Set queriesToProcessTokens = Sets.newHashSet( // "Natalie_Portman" // "Pakistan" - "Lou_Reed" +// "Lou_Reed" ); Set queriesToOmitTokens = Sets.newHashSet( @@ -261,6 +262,8 @@ public PRConvergenceExperiment(EvaluationDataset dataset, File benchmarkDirector } cacheDirectory = new File(benchmarkDirectory, "cache"); + + filter = dataset.getPredicateFilter(); } private void setupDatabase() { @@ -321,11 +324,11 @@ private void setupDatabase() { stmt.execute(sql); sql = "CREATE TABLE IF NOT EXISTS eval_detailed (" + - "target_query VARCHAR(500)," + + "target_query VARCHAR(700)," + "nrOfExamples TINYINT, " + "noise DOUBLE, " + - "heuristic VARCHAR(100), " + - "heuristic_measure VARCHAR(100), " + + "heuristic VARCHAR(50), " + + "heuristic_measure VARCHAR(50), " + "query_top LONGTEXT, " + "fscore_top DOUBLE, " + "precision_top DOUBLE, " + @@ -399,7 +402,7 @@ private void setupDatabase() { } psInsertDetailEval = conn.prepareStatement(sql); } catch (Exception e) { - e.printStackTrace(); + throw new RuntimeException("Database setup failed", e); } } @@ -415,6 +418,14 @@ private int getKBSize() { return size; } + public void setQueriesToOmitTokens(Collection queriesToOmitTokens) { + this.queriesToOmitTokens.addAll(queriesToOmitTokens); + } + + public void setQueriesToOmitTokens(Set queriesToOmitTokens) { + this.queriesToOmitTokens = queriesToOmitTokens; + } + public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInterval, double[] noiseInterval, HeuristicType[] measures) throws Exception{ this.maxTreeDepth = maxTreeDepth; queryTreeFactory.setMaxDepth(maxTreeDepth); @@ -429,18 +440,17 @@ public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInte this.measures = measures; } - nrOfExamplesIntervals = new int[]{3, 5};//, 10, 20, 30, 40}; boolean posOnly = true; boolean noiseEnabled = false; logger.info("Started QTL evaluation..."); long t1 = System.currentTimeMillis(); - List queries = dataset.getSparqlQueries(); + List queries = dataset.getSparqlQueries().values().stream().map(q -> q.toString()).collect(Collectors.toList()); logger.info("#loaded queries: " + 
queries.size()); // filter for debugging purposes - queries = queries.stream().filter(q -> tokens.stream().noneMatch(t -> !q.contains(t))).collect(Collectors.toList()); + queries = queries.stream().filter(q -> queriesToProcessTokens.stream().noneMatch(t -> !q.contains(t))).collect(Collectors.toList()); queries = queries.stream().filter(q -> queriesToOmitTokens.stream().noneMatch(t -> q.contains(t))).collect(Collectors.toList()); @@ -567,6 +577,7 @@ public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInte // loop over SPARQL queries for (final String sparqlQuery : queriesToProcess) { +// CBDStructureTree cbdStructure = defaultCbdStructure;//QueryUtils.getOptimalCBDStructure(QueryFactory.create(sparqlQuery)); CBDStructureTree cbdStructure = QueryUtils.getOptimalCBDStructure(QueryFactory.create(sparqlQuery)); tp.submit(() -> { @@ -577,6 +588,9 @@ public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInte // we repeat it n times with different permutations of examples int nrOfPermutations = 1; + if(nrOfExamples >= query2Examples.get(sparqlQuery).correctPosExampleCandidates.size()){ + nrOfPermutations = 1; + } for(int perm = 1; perm <= nrOfPermutations; perm++) { logger.info("Run {}/{}", perm, nrOfPermutations); try { @@ -611,7 +625,8 @@ public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInte PosNegLPStandard lp = new PosNegLPStandard(); lp.setPositiveExamples(examples.posExamplesMapping.keySet()); lp.setNegativeExamples(examples.negExamplesMapping.keySet()); - QTL2Disjunctive la = new QTL2Disjunctive(lp, qef); +// QTL2Disjunctive la = new QTL2Disjunctive(lp, qef); + QTL2DisjunctiveMultiThreaded la = new QTL2DisjunctiveMultiThreaded(lp, qef); la.setRenderer(new org.dllearner.utilities.owl.DLSyntaxObjectRenderer()); la.setReasoner(dataset.getReasoner()); la.setEntailment(Entailment.SIMPLE); @@ -632,14 +647,14 @@ public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInte // the best returned solution by QTL EvaluatedRDFResourceTree bestSolution = solutions.get(0); logger.info("Got " + solutions.size() + " query trees."); - logger.info("Best computed solution:\n" + render(bestSolution.asEvaluatedDescription())); +// logger.info("Best computed solution:\n" + render(bestSolution.asEvaluatedDescription())); logger.info("QTL Score:\n" + bestSolution.getTreeScore()); long runtimeBestSolution = la.getTimeBestSolutionFound(); bestReturnedSolutionRuntimeStats.addValue(runtimeBestSolution); // convert to SPARQL query RDFResourceTree tree = bestSolution.getTree(); - // filter.filter(tree); + tree = filter.apply(tree); String learnedSPARQLQuery = QueryTreeUtils.toSPARQLQueryString( tree, dataset.getBaseIRI(), dataset.getPrefixMapping()); @@ -685,7 +700,7 @@ public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInte } String bestQuery = QueryFactory.create(QueryTreeUtils.toSPARQLQueryString( - filter.filter(bestMatchingTree.getTree()), + filter.apply(bestMatchingTree.getTree()), dataset.getBaseIRI(), dataset.getPrefixMapping())).toString(); if (write2DB) { @@ -1290,6 +1305,7 @@ private List getResultSplitted(String sparqlQuery){ } private Score computeScore(String referenceSparqlQuery, RDFResourceTree tree, double noise) throws Exception{ + logger.info("computing score..."); // apply some filters QueryTreeUtils.removeVarLeafs(tree); QueryTreeUtils.prune(tree, null, Entailment.RDF); @@ -1297,9 +1313,8 @@ private Score computeScore(String referenceSparqlQuery, RDFResourceTree tree, do // remove 
redundant rdf:type triples
 		QueryTreeUtils.keepMostSpecificTypes(tree, dataset.getReasoner());
 
-		//
-		PredicateExistenceFilter filter = new PredicateExistenceFilterDBpedia(null);
-		tree = filter.filter(tree);
+		// remove predicates which do not contribute to the query if they simply exist without a concrete value
+		tree = filter.apply(tree);
 
 		String learnedSPARQLQuery = QueryTreeUtils.toSPARQLQueryString(tree, dataset.getBaseIRI(), dataset.getPrefixMapping());
 		logger.info("learned SPARQL query:\n{}", learnedSPARQLQuery);
@@ -1547,6 +1562,7 @@ public static void main(String[] args) throws Exception {
 		Logger.getLogger(QueryExecutionFactoryCacheEx.class).setLevel(Level.INFO);
 
 		OptionParser parser = new OptionParser();
+		OptionSpec<String> datasetSpec = parser.accepts("dataset", "possible datasets: QALD4-Bio or QALD6-DBpedia").withRequiredArg().ofType(String.class).required();
 		OptionSpec<File> benchmarkDirectorySpec = parser.accepts("d", "base directory").withRequiredArg().ofType(File.class).required();
 		OptionSpec<File> queriesFileSpec = parser.accepts("q", "processed queries file").withRequiredArg().ofType(File.class);
 		OptionSpec<URL> endpointURLSpec = parser.accepts("e", "endpoint URL").withRequiredArg().ofType(URL.class).required();
@@ -1559,11 +1575,13 @@
 		OptionSpec<Integer> maxQTLRuntimeSpec = parser.accepts("max-qtl-runtime", "max. runtime of each QTL run").withRequiredArg().ofType(Integer.class).defaultsTo(60);
 		OptionSpec<Integer> nrOfThreadsSpec = parser.accepts("thread-count", "number of threads used for parallel evaluation").withRequiredArg().ofType(Integer.class).defaultsTo(1);
 
-		OptionSpec<String> exampleIntervalsSpec = parser.accepts("examples", "comma-separated list of number of examples used in evaluation").withRequiredArg().ofType(String.class);
-		OptionSpec<String> noiseIntervalsSpec = parser.accepts("noise", "comma-separated list of noise values used in evaluation").withRequiredArg().ofType(String.class);
+		OptionSpec<String> exampleIntervalsSpec = parser.accepts("examples", "comma-separated list of number of examples used in evaluation").withRequiredArg().ofType(String.class).defaultsTo("");
+		OptionSpec<String> noiseIntervalsSpec = parser.accepts("noise", "comma-separated list of noise values used in evaluation").withRequiredArg().ofType(String.class).defaultsTo("");
 		OptionSpec<String> measuresSpec = parser.accepts("measures", "comma-separated list of measures used in evaluation").withRequiredArg().ofType(String.class);
 
-		OptionSet options = parser.parse(args);
+		OptionSpec<String> queriesToOmitTokensSpec = parser.accepts("omitTokens", "comma-separated list of tokens such that queries containing any of them will be omitted").withRequiredArg().ofType(String.class).defaultsTo("");
+
+		OptionSet options = parser.parse(args);
 
 		File benchmarkDirectory = options.valueOf(benchmarkDirectorySpec);
 		boolean write2DB = options.valueOf(write2DBSpec);
@@ -1616,9 +1634,25 @@
 		}
 	}
 
+		List<String> omitTokens = Splitter
+				.on(",")
+				.omitEmptyStrings()
+				.trimResults()
+				.splitToList(options.valueOf(queriesToOmitTokensSpec));
+
 	//	EvaluationDataset dataset = new DBpediaEvaluationDataset(benchmarkDirectory, endpoint, queriesFile);
-		EvaluationDataset dataset = new QALDEvaluationDataset(benchmarkDirectory);
+		String datasetName = options.valueOf(datasetSpec);
+		EvaluationDataset dataset;
+		if(datasetName.equals("QALD4-Bio")){
+			dataset = new QALD4BiomedicalChallengeEvaluationDataset(benchmarkDirectory);
+		} else if(datasetName.equals("QALD6-DBpedia")){
+			dataset = new 
QALD6DBpediaEvaluationDataset(benchmarkDirectory); + } else { + throw new RuntimeException("Unsupported dataset:" + datasetName); + } + PRConvergenceExperiment eval = new PRConvergenceExperiment(dataset, benchmarkDirectory, write2DB, override, maxQTLRuntime, useEmailNotification, nrOfThreads); + eval.setQueriesToOmitTokens(omitTokens); eval.run(maxNrOfQueries, maxTreeDepth, exampleInterval, noiseInterval, measures); // new QALDExperiment(Dataset.BIOMEDICAL).run(); diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/PredicateExistenceFilterBiomedical.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/PredicateExistenceFilterBiomedical.java new file mode 100644 index 0000000000..aac1148464 --- /dev/null +++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/PredicateExistenceFilterBiomedical.java @@ -0,0 +1,135 @@ +/** + * Copyright (C) 2007 - 2016, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.dllearner.algorithms.qtl.experiments; + +import com.google.common.base.Charsets; +import com.google.common.io.Files; +import org.apache.jena.graph.Node; +import org.apache.jena.graph.NodeFactory; +import org.apache.jena.query.QueryExecution; +import org.apache.jena.query.QuerySolution; +import org.apache.jena.query.ResultSet; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.sparql.util.NodeComparator; +import org.dllearner.algorithms.qtl.util.filters.PredicateExistenceFilter; +import org.dllearner.kb.SparqlEndpointKS; +import org.dllearner.kb.sparql.SparqlEndpoint; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +/** + * @author Lorenz Buehmann + * + */ +public class PredicateExistenceFilterBiomedical extends PredicateExistenceFilter{ + + private String DIR = "org/dllearner/algorithms/qtl/"; + private String FILE = "biomedical_meaningless_properties.txt"; + private String PATH = DIR + FILE; + + public PredicateExistenceFilterBiomedical() {} + + public void init() { + Set existentialMeaninglessProperties = new TreeSet<>(new NodeComparator()); + + try { + List lines = Files.readLines(new File(this.getClass().getClassLoader().getResource(PATH).toURI()), Charsets.UTF_8); + for (String line : lines) { + if(!line.trim().isEmpty() && !line.startsWith("#")) { + existentialMeaninglessProperties.add(NodeFactory.createURI(line.trim())); + } + } + } catch (IOException | URISyntaxException e) { + e.printStackTrace(); + } + setExistentialMeaninglessProperties(existentialMeaninglessProperties); + } + + public void analyze(SparqlEndpointKS ks) { + Set existentialMeaninglessProperties = new TreeSet<>(new NodeComparator()); + + StringBuilder sb = new StringBuilder(); + + // for each class +// String query = "SELECT ?cls WHERE {[] a ?cls .}"; + + + + + // check data 
+		String query = "SELECT DISTINCT ?p WHERE {?s ?p ?o .}";
+
+		QueryExecution qe = ks.getQueryExecutionFactory().createQueryExecution(query);
+		ResultSet rs = qe.execSelect();
+		while (rs.hasNext()) {
+			QuerySolution qs = rs.next();
+
+			Resource property = qs.getResource("p");
+//			Resource range = qs.getResource("range"); // ?range is not bound by the query above; add it to the SELECT clause before enabling the check below
+//			if(range.equals(XSD.xdouble)) {
+				existentialMeaninglessProperties.add(property.asNode());
+//			}
+		}
+		qe.close();
+		for (Node p : existentialMeaninglessProperties) {
+			sb.append(p).append("\n");
+		}
+		existentialMeaninglessProperties.clear();
+		sb.append("\n\n");
+
+		// check object properties (full IRI used because no owl: prefix is declared for this query)
+		query = "SELECT ?p WHERE {?p a <http://www.w3.org/2002/07/owl#ObjectProperty> .}";
+
+		qe = ks.getQueryExecutionFactory().createQueryExecution(query);
+		rs = qe.execSelect();
+		while (rs.hasNext()) {
+			QuerySolution qs = rs.next();
+
+			Resource property = qs.getResource("p");
+			existentialMeaninglessProperties.add(property.asNode());
+
+		}
+		qe.close();
+
+		for (Node p : existentialMeaninglessProperties) {
+			sb.append(p).append("\n");
+		}
+		try {
+			// note: this writes back into the classpath copy of the resource file
+			Files.write(sb.toString(), new File(this.getClass().getClassLoader().getResource(PATH).toURI()), Charsets.UTF_8);
+		} catch (IOException | URISyntaxException e) {
+			e.printStackTrace();
+		}
+	}
+
+	public static void main(String[] args) throws Exception {
+		SparqlEndpoint endpoint = SparqlEndpoint.create("http://sake.informatik.uni-leipzig.de:8890/sparql", "http://biomedical.org");
+		SparqlEndpointKS ks = new SparqlEndpointKS(endpoint);
+		ks.init();
+		new PredicateExistenceFilterBiomedical().analyze(ks);
+	}
+
+}
diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QALD4BiomedicalChallengeEvaluationDataset.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QALD4BiomedicalChallengeEvaluationDataset.java
new file mode 100644
index 0000000000..17a98e47af
--- /dev/null
+++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QALD4BiomedicalChallengeEvaluationDataset.java
@@ -0,0 +1,239 @@
+/**
+ * Copyright (C) 2007 - 2016, Jens Lehmann
+ *
+ * This file is part of DL-Learner.
+ *
+ * DL-Learner is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * DL-Learner is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.dllearner.algorithms.qtl.experiments;
+
+import com.google.common.collect.Lists;
+import org.aksw.jena_sparql_api.cache.h2.CacheUtilsH2;
+import org.aksw.jena_sparql_api.core.FluentQueryExecutionFactory;
+import org.aksw.jena_sparql_api.core.QueryExecutionFactory;
+import org.aksw.jena_sparql_api.http.QueryExecutionHttpWrapper;
+import org.apache.jena.query.Query;
+import org.apache.jena.query.QueryFactory;
+import org.apache.jena.rdf.model.Statement;
+import org.apache.jena.riot.WebContent;
+import org.apache.jena.shared.PrefixMapping;
+import org.apache.jena.sparql.engine.http.QueryEngineHTTP;
+import org.dllearner.algorithms.qtl.util.StopURIsRDFS;
+import org.dllearner.algorithms.qtl.util.filters.PredicateDropStatementFilter;
+import org.dllearner.core.ComponentInitException;
+import org.dllearner.kb.SparqlEndpointKS;
+import org.dllearner.kb.sparql.SPARQLQueryUtils;
+import org.dllearner.kb.sparql.SparqlEndpoint;
+import org.dllearner.reasoning.SPARQLReasoner;
+import org.dllearner.utilities.QueryUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import java.io.File;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Predicate;
+
+/**
+ * @author Lorenz Buehmann
+ *
+ */
+public class QALD4BiomedicalChallengeEvaluationDataset extends EvaluationDataset {
+
+	private static final Logger log = LoggerFactory.getLogger(QALD4BiomedicalChallengeEvaluationDataset.class);
+
+	private static final String RESOURCES_DIR = "org/dllearner/algorithms/qtl/";
+	private static final String TRAIN_FILE = RESOURCES_DIR + "qald-4_biomedical_train.xml";
+	private static final String TEST_FILE = RESOURCES_DIR + "qald-4_biomedical_test.xml";
+	private static final Map<String, String> DATASET_FILES = new LinkedHashMap<>();
+	static {
+		DATASET_FILES.put(TRAIN_FILE, "qald-4-bio-train");
+		DATASET_FILES.put(TEST_FILE, "qald-4-bio-test");
+	}
+
+	private static SparqlEndpoint endpoint;
+	static {
+		try {
+			endpoint = SparqlEndpoint.create(
+					"http://sake.informatik.uni-leipzig.de:8890/sparql",
+					Lists.newArrayList("http://biomedical.org"));
+		} catch (MalformedURLException e) {
+			e.printStackTrace();
+		}
+	}
+
+	public QALD4BiomedicalChallengeEvaluationDataset(File benchmarkDirectory) {
+		this(benchmarkDirectory, endpoint);
+	}
+
+	public QALD4BiomedicalChallengeEvaluationDataset(File benchmarkDirectory, SparqlEndpoint endpoint) {
+		super("QALD");
+		// set KS
+		File cacheDir = new File(benchmarkDirectory, "cache");
+		QueryExecutionFactory qef = FluentQueryExecutionFactory
+				.http(endpoint.getURL().toString(), endpoint.getDefaultGraphURIs())
+				.config().withPostProcessor(qe -> ((QueryEngineHTTP) ((QueryExecutionHttpWrapper) qe).getDecoratee())
+						.setModelContentType(WebContent.contentTypeRDFXML))
+				.end()
+				.create();
+		qef = CacheUtilsH2.createQueryExecutionFactory(qef, cacheDir.getAbsolutePath() + "/sparql/qtl-AAAI2017-cache;mv_store=false", false, TimeUnit.DAYS.toMillis(7));
+		try {
+			ks = new SparqlEndpointKS(endpoint);
+			ks.setCacheDir(cacheDir.getAbsolutePath() + "/sparql/qtl-AAAI2017-cache;mv_store=false");
+			ks.setQueryExecutionFactory(qef);
+			ks.init();
+		} catch (ComponentInitException e) {
+			e.printStackTrace();
+		}
+
+		sparqlQueries = new LinkedHashMap<>();
+
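+		// namespace prefixes of the QALD-4 biomedical datasets (DrugBank, SIDER,
+		// Diseasome), used to render IRIs in learned queries more compactly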
+		prefixMapping = PrefixMapping.Factory.create().withDefaultMappings(PrefixMapping.Standard);
+		prefixMapping.setNsPrefix("drugbank", "http://www4.wiwiss.fu-berlin.de/drugbank/resource/drugbank/");
+		prefixMapping.setNsPrefix("drugs", "http://www4.wiwiss.fu-berlin.de/drugbank/resource/drugs/");
+		prefixMapping.setNsPrefix("drug-targets", "http://www4.wiwiss.fu-berlin.de/drugbank/resource/targets/");
+		prefixMapping.setNsPrefix("sider", "http://www4.wiwiss.fu-berlin.de/sider/resource/sider/");
+		prefixMapping.setNsPrefix("side-effects", "http://www4.wiwiss.fu-berlin.de/sider/resource/side_effects/");
+		prefixMapping.setNsPrefix("diseasome", "http://www4.wiwiss.fu-berlin.de/diseasome/resource/diseasome/");
+		prefixMapping.setNsPrefix("diseases", "http://www4.wiwiss.fu-berlin.de/diseasome/resource/diseases/");
+
+		DATASET_FILES.entrySet().forEach(entry -> {
+			try {
+				process(entry.getKey(), entry.getValue());
+			} catch (Exception e) {
+				e.printStackTrace();
+			}
+		});
+
+		reasoner = new SPARQLReasoner(ks);
+		try {
+			reasoner.init();
+		} catch (ComponentInitException e) {
+			e.printStackTrace();
+		}
+
+		PredicateExistenceFilterBiomedical predicateFilter = new PredicateExistenceFilterBiomedical();
+		predicateFilter.init();
+		setPredicateFilter(predicateFilter);
+	}
+
+	private void process(String datasetFile, String datasetPrefix) throws Exception {
+		DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+		DocumentBuilder db = dbf.newDocumentBuilder();
+		try (InputStream is = getClass().getClassLoader().getResourceAsStream(datasetFile)) {
+			Document doc = db.parse(is);
+			doc.getDocumentElement().normalize();
+			NodeList questionNodes = doc.getElementsByTagName("question");
+
+			QueryUtils triplePatternExtractor = new QueryUtils();
+
+			for (int i = 0; i < questionNodes.getLength(); i++) {
+
+				org.w3c.dom.Element questionNode = (org.w3c.dom.Element) questionNodes.item(i);
+
+				String id = datasetPrefix + "_" + Integer.valueOf(questionNode.getAttribute("id"));
+//				String answerType = questionNode.getAttribute("answerType");
+				boolean aggregation = false;//Boolean.valueOf(questionNode.getAttribute("aggregation"));
+
+				// read the SPARQL query
+				String sparqlQuery = ((org.w3c.dom.Element) questionNode.getElementsByTagName("query").item(0)).getChildNodes().item(0).getNodeValue().trim();
+				sparqlQuery = SPARQLQueryUtils.PREFIXES + " " + sparqlQuery;
+				// strip the OPTIONAL English rdfs:label part from the query
+				if(sparqlQuery.contains("OPTIONAL {?uri rdfs:label ?string . FILTER (lang(?string) = 'en') }")){
+					sparqlQuery = sparqlQuery.replace("OPTIONAL {?uri rdfs:label ?string . FILTER (lang(?string) = 'en') }", "");
+					sparqlQuery = sparqlQuery.replace("FILTER (lang(?string) = 'en')", "");
+					sparqlQuery = sparqlQuery.replace("?string", "");
+				}
+				if(sparqlQuery.contains("OPTIONAL {?uri rdfs:label ?string. FILTER (lang(?string) = 'en') }")){
+					sparqlQuery = sparqlQuery.replace("OPTIONAL {?uri rdfs:label ?string. FILTER (lang(?string) = 'en') }", "");
+					sparqlQuery = sparqlQuery.replace("?string", "");
+				}
+
+//				System.out.println(sparqlQuery);
+				// check if marked as OUT OF SCOPE
+				boolean outOfScope = sparqlQuery.toUpperCase().contains("OUT OF SCOPE");
+
+				// check if ASK query
+				boolean askQuery = sparqlQuery.toUpperCase().contains("ASK");
+
+				boolean containsLimit = sparqlQuery.toUpperCase().contains("LIMIT");
+
+				boolean containsCount = sparqlQuery.toUpperCase().contains("COUNT");
+
+				boolean containsFilter = sparqlQuery.toUpperCase().contains("FILTER");
+
+				boolean containsUNION = sparqlQuery.toUpperCase().contains("UNION");
+
+				boolean needsSPARQL11 = sparqlQuery.toUpperCase().contains("MINUS") ||
+						sparqlQuery.toUpperCase().contains("EXISTS");
+
+				boolean singleProjectionVariable = true;
+
+				if(!needsSPARQL11
+						&& !aggregation
+						&& !outOfScope
+						&& !containsCount
+						&& !askQuery
+						&& singleProjectionVariable
+						&& !containsLimit
+						&& !containsFilter
+						&& !containsUNION
+						){
+					Query query = QueryFactory.create(sparqlQuery);
+					adjustPrefixes(query);
+					List<String> result = SPARQLUtils.getResult(ks.getQueryExecutionFactory(), query);
+					// keep only queries whose first result is a resource
+					boolean isResourceTarget = !result.isEmpty() && result.get(0).startsWith("http://");
+					if(isResourceTarget) {
+						sparqlQueries.put(id, query);
+					}
+				}
+			}
+		} catch (Exception e) {
+			log.error("Failed to load QALD dataset.", e);
+		}
+	}
+
+	@Override
+	@SuppressWarnings("unchecked")
+	public List<Predicate<Statement>> getQueryTreeFilters() {
+		return Lists.newArrayList(
+				new PredicateDropStatementFilter(StopURIsRDFS.get()));
+	}
+
+	public static void main(String[] args) throws Exception {
+		QALD4BiomedicalChallengeEvaluationDataset ds = new QALD4BiomedicalChallengeEvaluationDataset(new File("/tmp/test"), endpoint);
+		ds.saveToDisk(new File("/tmp/qald4_bio_queries.txt"));
+//		List<String> queries = ds.getSparqlQueries();
+//		System.out.println(queries.size());
+//		queries.forEach(q -> System.out.println(QueryFactory.create(q)));
+//		queries.forEach(q -> System.out.println(ds.getKS().getQueryExecutionFactory().createQueryExecution(q).execSelect().hasNext()));
+//
+//		ds.analyze();
+	}
+
+}
diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QALDEvaluationDataset.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QALD6DBpediaEvaluationDataset.java
similarity index 68%
rename from components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QALDEvaluationDataset.java
rename to components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QALD6DBpediaEvaluationDataset.java
index 393b30e208..2990a8c620 100644
--- a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QALDEvaluationDataset.java
+++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QALD6DBpediaEvaluationDataset.java
@@ -25,6 +25,7 @@
 import org.aksw.jena_sparql_api.core.FluentQueryExecutionFactory;
 import org.aksw.jena_sparql_api.core.QueryExecutionFactory;
 import org.aksw.jena_sparql_api.http.QueryExecutionHttpWrapper;
+import org.apache.jena.query.Query;
 import org.apache.jena.query.QueryFactory;
 import org.apache.jena.rdf.model.Statement;
 import org.apache.jena.riot.WebContent;
@@ -38,9 +39,7 @@
 import org.dllearner.algorithms.qtl.util.StopURIsOWL;
 import org.dllearner.algorithms.qtl.util.StopURIsRDFS;
 import org.dllearner.algorithms.qtl.util.StopURIsSKOS;
-import org.dllearner.algorithms.qtl.util.filters.NamespaceDropStatementFilter;
-import org.dllearner.algorithms.qtl.util.filters.ObjectDropStatementFilter;
-import org.dllearner.algorithms.qtl.util.filters.PredicateDropStatementFilter;
+import org.dllearner.algorithms.qtl.util.filters.*;
 import org.dllearner.core.ComponentInitException;
 import org.dllearner.kb.SparqlEndpointKS;
 import org.dllearner.kb.sparql.SPARQLQueryUtils;
@@ -52,10 +51,9 @@
 import java.io.File;
 import java.io.InputStream;
 import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.Arrays;
+import java.util.LinkedHashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
@@ -64,9 +62,9 @@
  * @author Lorenz Buehmann
  *
  */
-public class QALDEvaluationDataset extends EvaluationDataset {
+public class QALD6DBpediaEvaluationDataset extends EvaluationDataset {
 
-	private static final Logger log = LoggerFactory.getLogger(QALDEvaluationDataset.class);
+	private static final Logger log = LoggerFactory.getLogger(QALD6DBpediaEvaluationDataset.class);
 
 	private static final String TRAIN_URL = "https://github.com/ag-sc/QALD/blob/master/6/data/qald-6-train-multilingual.json?raw=true";
 	private static final String TEST_URL = "https://github.com/ag-sc/QALD/blob/master/6/data/qald-6-test-multilingual.json?raw=true";
@@ -78,10 +76,11 @@ public class QALDEvaluationDataset {
 	private static final String RESOURCES_DIR = "org/dllearner/algorithms/qtl/";
 	private static final String TRAIN_FILE = RESOURCES_DIR + "qald-6-train-multilingual.json";
 	private static final String TEST_FILE = RESOURCES_DIR + "qald-6-test-multilingual.json";
-	private static final String[] DATASET_FILES = {
-			TRAIN_FILE,
-			TEST_FILE
-	};
+	private static final Map<String, String> DATASET_FILES = new LinkedHashMap<>();
+	static {
+		DATASET_FILES.put(TRAIN_FILE, "qald-6-train");
+		DATASET_FILES.put(TEST_FILE, "qald-6-test");
+	}
 
 	private static SparqlEndpoint endpoint;
 
@@ -95,11 +94,11 @@
 		}
 	}
 
-	public QALDEvaluationDataset(File benchmarkDirectory) {
+	public QALD6DBpediaEvaluationDataset(File benchmarkDirectory) {
 		this(benchmarkDirectory, endpoint);
 	}
 
-	public QALDEvaluationDataset(File benchmarkDirectory, SparqlEndpoint endpoint) {
+	public QALD6DBpediaEvaluationDataset(File benchmarkDirectory, SparqlEndpoint endpoint) {
 		super("QALD");
 		// set KS
 		File cacheDir = new File(benchmarkDirectory, "cache");
@@ -119,66 +118,66 @@ public QALDEvaluationDataset(File benchmarkDirectory, SparqlEndpoint endpoint)
 			e.printStackTrace();
 		}
 
-		final List<Question> questions = new ArrayList<>();
-//		Arrays.stream(DATASET_URLS).forEach(ds -> {
-//			try {
-//				URL url = new URL(ds);
-//				try (InputStream is = url.openStream()) {
-//					questions.addAll(QALDJsonLoader.loadQuestions(is));
-//				} catch (Exception e) {
-//					log.error("Failed to load QALD dataset.", e);
-//				}
-//			} catch (MalformedURLException e) {
-//				e.printStackTrace();
-//			}
-//		});
-		Arrays.stream(DATASET_FILES).forEach(file -> {
-			try (InputStream is = getClass().getClassLoader().getResourceAsStream(file)) {
-				questions.addAll(QALDJsonLoader.loadQuestions(is));
-			} catch (Exception e) {
-				log.error("Failed to load QALD dataset.", e);
-			}
-		});
+		sparqlQueries = new LinkedHashMap<>();
+		DATASET_FILES.entrySet().forEach(entry -> {
+			process(entry.getKey(), entry.getValue());
+		});
 
-		// prepend missing PREFIXES to SPARQL query
-		questions.stream().forEach(q -> q.getQuery().setSparql(
-				SPARQLQueryUtils.PREFIXES + " " + "PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n" + q.getQuery().getSparql()));
-
-		// filter the questions
-		List<Question> filteredQuestions = questions.stream()
-				.filter(QALDPredicates.hasNoAnswer().negate()) // no answer SPARQL query
-				.filter(q -> !q.isAggregation()) // no aggregation
-				.filter(q -> q.getAnswertype().equals("resource")) // only resources
-				.filter(q -> !q.getAnswers().isEmpty()) // skip no answers
-				.filter(q -> !q.getAnswers().get(0).getAdditionalProperties().containsKey("boolean")) // only resources due to bug in QALD
-				.filter(QALDPredicates.isUnion().negate()) // skip UNION queries
-				.filter(QALDPredicates.hasFilter().negate()) // skip FILTER queries
-				.filter(QALDPredicates.isOnlyDBO())
-				.filter(q -> q.getAnswers().get(0).getResults().getBindings().size() >= 2) // result size >= 2
-				.filter(QALDPredicates.isObjectTarget().or(QALDPredicates.isSubjectTarget()))
-//				.filter(q -> q.getQuery().getSparql().toLowerCase().contains("three_dancers"))
-				.sorted((q1, q2) -> ComparisonChain.start().compare(q1.getId(), q2.getId()).compare(q1.getQuery().getSparql(), q2.getQuery().getSparql()).result()) // sort by ID
-				.collect(Collectors.toList());
-
-		// map to SPARQL queries
-		sparqlQueries = filteredQuestions.stream()
-				.map(q -> q.getQuery().getSparql())
-				.collect(Collectors.toList());
-
 		reasoner = new SPARQLReasoner(ks);
 		try {
 			reasoner.init();
 		} catch (ComponentInitException e) {
 			e.printStackTrace();
 		}
-
+		baseIRI = "http://dbpedia.org/resource/";
 		prefixMapping = PrefixMapping.Factory.create().withDefaultMappings(PrefixMapping.Standard);
 		prefixMapping.setNsPrefix("dbo", "http://dbpedia.org/ontology/");
 		prefixMapping.setNsPrefix("wiki", "http://wikidata.dbpedia.org/resource/");
 		prefixMapping.setNsPrefix("odp-dul", "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#");
 		prefixMapping.setNsPrefix("schema", "http://schema.org/");
+
+		PredicateExistenceFilter predicateFilter = new PredicateExistenceFilterDBpedia(null);
+		setPredicateFilter(predicateFilter);
+
+	}
+
+	private void process(String datasetFile, String datasetPrefix) {
+		try (InputStream is = getClass().getClassLoader().getResourceAsStream(datasetFile)) {
+			final List<Question> questions = QALDJsonLoader.loadQuestions(is);
+
+			// prepend missing PREFIXES to SPARQL query
+			questions.stream().forEach(q -> q.getQuery().setSparql(
+					SPARQLQueryUtils.PREFIXES + " " + "PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n" + q.getQuery().getSparql()));
+
+			// filter the questions
+			List<Question> filteredQuestions = questions.stream()
+					.filter(QALDPredicates.hasNoAnswer().negate()) // no answer SPARQL query
+					.filter(q -> !q.isAggregation()) // no aggregation
+					.filter(q -> q.getAnswertype().equals("resource")) // only resources
+					.filter(q -> !q.getAnswers().isEmpty()) // skip no answers
+					.filter(q -> !q.getAnswers().get(0).getAdditionalProperties().containsKey("boolean")) // only resources due to bug in QALD
+					.filter(QALDPredicates.isUnion().negate()) // skip UNION queries
+					.filter(QALDPredicates.hasFilter().negate()) // skip FILTER queries
+					.filter(QALDPredicates.isOnlyDBO())
+					.filter(q -> q.getAnswers().get(0).getResults().getBindings().size() >= 2) // result size >= 2
+					.filter(QALDPredicates.isObjectTarget().or(QALDPredicates.isSubjectTarget()))
+//					.filter(q -> q.getQuery().getSparql().toLowerCase().contains("three_dancers"))
+					.sorted((q1, q2) -> ComparisonChain.start().compare(q1.getId(), q2.getId()).compare(q1.getQuery().getSparql(), q2.getQuery().getSparql()).result()) // sort by ID
+					.collect(Collectors.toList());
+
+			// map to SPARQL queries: build a LinkedHashMap keyed by
+			// "<datasetPrefix>_<questionId>" so that insertion order is preserved;
+			// the no-op combiner below is only safe because the stream is sequential
+			sparqlQueries.putAll(filteredQuestions.stream()
+					.collect(LinkedHashMap::new,
+							(m, q) -> m.put(datasetPrefix + "_" + String.valueOf(q.getId()), QueryFactory.create(q.getQuery().getSparql())),
+							(m, u) -> {}));
+
+		} catch (Exception e) {
+			log.error("Failed to load QALD dataset.", e);
+		}
+
+	}
@@ -211,7 +210,7 @@ public static void main(String[] args) throws Exception{
 		SparqlEndpoint endpoint = SparqlEndpoint.create("http://sake.informatik.uni-leipzig.de:8890/sparql", "http://dbpedia.org");
 		endpoint = SparqlEndpoint.getEndpointDBpedia();
 
-		QALDEvaluationDataset ds = new QALDEvaluationDataset(new File("/tmp/test"), endpoint);
+		QALD6DBpediaEvaluationDataset ds = new QALD6DBpediaEvaluationDataset(new File("/tmp/test"), endpoint);
 		ds.saveToDisk(new File("/tmp/qald2016-queries.txt"));
//		List<String> queries = ds.getSparqlQueries();
//		System.out.println(queries.size());
diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QTLEvaluation.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QTLEvaluation.java
index fd1d5fb09c..23124877dd 100644
--- a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QTLEvaluation.java
+++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/QTLEvaluation.java
@@ -479,7 +479,7 @@ public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInte
 		logger.info("Started QTL evaluation...");
 		long t1 = System.currentTimeMillis();
 
-		List<String> queries = dataset.getSparqlQueries();
+		List<String> queries = dataset.getSparqlQueries().values().stream().map(q -> q.toString()).collect(Collectors.toList());
 		logger.info("#loaded queries: " + queries.size());
 
 		// filter for debugging purposes
@@ -714,7 +714,7 @@ public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInte
 					}
 					String bestQuery = QueryFactory.create(QueryTreeUtils.toSPARQLQueryString(
-							filter.filter(bestMatchingTree.getTree()),
+							filter.apply(bestMatchingTree.getTree()),
 							dataset.getBaseIRI(), dataset.getPrefixMapping())).toString();
 					if(write2DB) {
@@ -1689,7 +1689,7 @@ private Score computeScore(String referenceSparqlQuery, RDFResourceTree tree, do
 		// filter tree
//		PredicateExistenceFilter filter = new PredicateExistenceFilterDBpedia(null);
-		tree = filter.filter(tree);
+		tree = filter.apply(tree);
 		String learnedSPARQLQuery = QueryTreeUtils.toSPARQLQueryString(tree, dataset.getBaseIRI(), dataset.getPrefixMapping());
 		logger.info("learned SPARQL query:\n{}", learnedSPARQLQuery);
@@ -2007,7 +2007,7 @@ public static void main(String[] args) throws Exception {
 		}
 
//		EvaluationDataset dataset = new DBpediaEvaluationDataset(benchmarkDirectory, endpoint, queriesFile);
-		EvaluationDataset dataset = new QALDEvaluationDataset(benchmarkDirectory);
+		EvaluationDataset dataset = new QALD6DBpediaEvaluationDataset(benchmarkDirectory);
 
 		QTLEvaluation eval = new QTLEvaluation(dataset, benchmarkDirectory, write2DB, override, maxQTLRuntime, useEmailNotification, nrOfThreads);
 		eval.run(maxNrOfQueries, maxTreeDepth, exampleInterval, noiseInterval, measures);
diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/SPARQLUtils.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/SPARQLUtils.java
index 219bc684cd..98efa25196 100644
--- a/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/SPARQLUtils.java
+++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/experiments/SPARQLUtils.java
@@ -51,10 +51,10 @@ public static List<String> getResult(QueryExecutionFactory qef, Query query, Var
 		try(QueryExecution qe = qef.createQueryExecution(query)) {
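+			// ResultSet is already an Iterator<QuerySolution>, so it can be exposed
+			// as a one-shot Iterable and streamed lazily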
 			ResultSet rs = qe.execSelect();
 			return StreamSupport.stream(((Iterable<QuerySolution>) () -> rs).spliterator(), false)
-					.map(qs -> qs.getResource(targetVar.getName()).getURI())
+					.map(qs -> qs.get(targetVar.getName()).toString())
 					.collect(Collectors.toList());
 		} catch (Exception e) {
-			throw new Exception("Failed to get result", e);
+			throw new Exception("Failed to get result for query\n" + query, e);
 		}
 	}
diff --git a/components-core/src/test/java/org/dllearner/algorithms/qtl/qald/QALDExperiment.java b/components-core/src/test/java/org/dllearner/algorithms/qtl/qald/QALDExperiment.java
index a091cb9099..a144e7c22c 100644
--- a/components-core/src/test/java/org/dllearner/algorithms/qtl/qald/QALDExperiment.java
+++ b/components-core/src/test/java/org/dllearner/algorithms/qtl/qald/QALDExperiment.java
@@ -265,7 +265,7 @@ public void run(){
 			logger.info("Score:\n" + bestSolution.getTreeScore());
 
 			// convert to SPARQL query
-			String learnedSPARQLQuery = QueryTreeUtils.toSPARQLQueryString(filter.filter(bestSolution.getTree()), kb.baseIRI, kb.prefixMapping);
+			String learnedSPARQLQuery = QueryTreeUtils.toSPARQLQueryString(filter.apply(bestSolution.getTree()), kb.baseIRI, kb.prefixMapping);
 
 			Score score = computeScore(sparqlQuery, learnedSPARQLQuery);
@@ -296,7 +296,7 @@ public void run(){
 				logger.info("Position of best covering tree in list: " + position);
 				logger.info("Best covering solution:\n" + bestMatchingTree.asEvaluatedDescription());
 				logger.info("Tree score: " + bestMatchingTree.getTreeScore());
-				String bestLearnedSPARQLQuery = QueryTreeUtils.toSPARQLQueryString(filter.filter(bestMatchingTree.getTree()), kb.baseIRI, kb.prefixMapping);
+				String bestLearnedSPARQLQuery = QueryTreeUtils.toSPARQLQueryString(filter.apply(bestMatchingTree.getTree()), kb.baseIRI, kb.prefixMapping);
 				precision = bestMatchingScore.getPrecision();
 				recall = bestMatchingScore.getRecall();
 				fmeasure = bestMatchingScore.getFmeasure();
@@ -373,7 +373,7 @@ private Pair findBestMatchingTree(Collection ?v2. ?v0 ?v1.