From 4195b0a070e4afd3797e15a8f688611ca1eb693b Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Wed, 23 Dec 2020 12:41:50 +1000 Subject: [PATCH] Inplace Mode, for when cloning your datagraph is undesirable - Normally pyshacl will create an in-memory copy of your datagraph before modifying it (when using ontology mixin, or inferencing features) - This might be unwanted if your datagraph is very large or remote and cloning it into memory is not a good option - Enabling inplace mode will bypass this clone step, and apply modification operations directly on your data_graph (use with caution!) - Enable with `inplace=True` kwarg on `validate()`. - Inplace mode is not yet available via the CLI application, and perhaps doesn't even make sense to have it available there. Inferencing will no longer incorrectly place expanded triples into your original data_graph, unless you enable 'inplace' SHACL-JS loader will no longer fail if the `regex` module is not installed (it will fall back to using builtin `re`) SHACL-Rule DASH-tests will now pass when the SHACL-rule is applied on multigraph (Dataset or ConjunctiveGraph) --- CHANGELOG.md | 19 ++++++++- pyproject.toml | 2 +- pyshacl/__init__.py | 2 +- pyshacl/extras/js/loader.py | 10 +++-- pyshacl/rdfutil/clone.py | 63 ++++++++++++++++++++++-------- pyshacl/validate.py | 50 +++++++++++++++++------- test/test_extra.py | 26 +++++++++--- test/test_js/test_js_constraint.py | 4 +- 8 files changed, 133 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5658cea..b17e3b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Python PEP 440 Versioning](https://www.python.org/dev/peps/pep-0440/). 
+## [0.14.1] - 2020-12-23 + +## Added +- Inplace Mode, for when cloning your datagraph is undesirable + - Normally pyshacl will create an in-memory copy of your datagraph before modifying it (when using the ontology mixin or inferencing features) + - This might be unwanted if your datagraph is very large or remote and cloning it into memory is not a good option + - Enabling inplace mode will bypass this clone step, and apply modification operations directly on your data_graph (use with caution!) + - Enable with `inplace=True` kwarg on `validate()`. + - Inplace mode is not yet available via the CLI application, and perhaps it doesn't even make sense to have it available there. + +## Fixed +- Inferencing will no longer incorrectly place expanded triples into your original data_graph, unless you enable 'inplace' +- SHACL-JS loader will no longer fail if the `regex` module is not installed (it will fall back to using builtin `re`) +- SHACL-Rule DASH-tests will now pass when the SHACL-rule is applied on a multigraph (Dataset or ConjunctiveGraph) + + ## [0.14.0] - 2020-10-14 ## Added @@ -694,7 +710,8 @@ just leaves the files open. 
Now it is up to the command-line client to close the - Initial version, limited functionality -[Unreleased]: https://github.com/RDFLib/pySHACL/compare/v0.14.0...HEAD +[Unreleased]: https://github.com/RDFLib/pySHACL/compare/v0.14.1...HEAD +[0.14.1]: https://github.com/RDFLib/pySHACL/compare/v0.14.0...v0.14.1 [0.14.0]: https://github.com/RDFLib/pySHACL/compare/v0.13.3...v0.14.0 [0.13.3]: https://github.com/RDFLib/pySHACL/compare/v0.13.2...v0.13.3 [0.13.2]: https://github.com/RDFLib/pySHACL/compare/v0.13.1...v0.13.2 diff --git a/pyproject.toml b/pyproject.toml index afd82f5..c680938 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "pyshacl" -version = "0.14.0" +version = "0.14.1" # Don't forget to change the version number in __init__.py along with this one description = "Python SHACL Validator" license = "Apache-2.0" diff --git a/pyshacl/__init__.py b/pyshacl/__init__.py index d0f45fa..ce164da 100644 --- a/pyshacl/__init__.py +++ b/pyshacl/__init__.py @@ -6,7 +6,7 @@ # version compliant with https://www.python.org/dev/peps/pep-0440/ -__version__ = '0.14.0' +__version__ = '0.14.1' # Don't forget to change the version number in pyproject.toml along with this one __all__ = ['validate', 'Validator', '__version__', 'Shape', 'ShapesGraph'] diff --git a/pyshacl/extras/js/loader.py b/pyshacl/extras/js/loader.py index 8fe185c..f8e5d6d 100644 --- a/pyshacl/extras/js/loader.py +++ b/pyshacl/extras/js/loader.py @@ -4,15 +4,19 @@ from urllib import request -import regex +try: + import regex +except ImportError: + import re + regex = re if typing.TYPE_CHECKING: from pyduktape2 import DuktapeContext -JS_FN_RE1 = regex.compile(rb'function\s+([^ \n]+)\s*\((.*)\)\s*\{', regex.MULTILINE, regex.IGNORECASE) +JS_FN_RE1 = regex.compile(rb'function\s+([^ \n]+)\s*\((.*)\)\s*\{', regex.MULTILINE | regex.IGNORECASE) JS_FN_RE2 = regex.compile( - rb'(?:let|const|var)\s+([^ \n]+)\s*=\s*function\s*\((.*)\)\s*\{', 
regex.MULTILINE, regex.IGNORECASE + rb'(?:let|const|var)\s+([^ \n]+)\s*=\s*function\s*\((.*)\)\s*\{', regex.MULTILINE | regex.IGNORECASE ) diff --git a/pyshacl/rdfutil/clone.py b/pyshacl/rdfutil/clone.py index 9b3b34c..1712c84 100644 --- a/pyshacl/rdfutil/clone.py +++ b/pyshacl/rdfutil/clone.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -from typing import Optional +from typing import Optional, Union import rdflib @@ -62,34 +62,58 @@ def clone_graph(source_graph, target_graph=None, identifier=None): return g -def mix_datasets(base_ds: ConjunctiveLike, extra_ds: GraphLike, target_ds: Optional[ConjunctiveLike] = None): +def mix_datasets(base_ds: ConjunctiveLike, extra_ds: GraphLike, target_ds: Optional[Union[ConjunctiveLike, str]] = None): + """ + Make a clone of base_ds (dataset) and add in the triples from extra_ds (dataset) + :param base_ds: + :type base_ds: rdflib.Dataset + :param extra_ds: + :type extra_ds: rdflib.Dataset + :param target_ds: + :type target_ds: rdflib.Dataset|str|NoneType + :return: The cloned Dataset with mixed in triples from extra_ds + :rtype: rdflib.Dataset + """ default_union = base_ds.default_union base_named_graphs = list(base_ds.contexts()) if target_ds is None: target_ds = rdflib.Dataset(default_union=default_union) + elif target_ds == "inplace": + pass # do nothing here elif not isinstance(target_ds, (rdflib.Dataset, rdflib.ConjunctiveGraph)): raise RuntimeError("Cannot mix datasets if target_ds passed in is not a Dataset itself.") if isinstance(extra_ds, (rdflib.Dataset, rdflib.ConjunctiveGraph)): mixin_graphs = list(extra_ds.contexts()) else: mixin_graphs = [extra_ds] - mixed_graphs = {} - for mg in mixin_graphs: - mod_named_graphs = { - g.identifier: mix_graphs(g, mg, target_graph=rdflib.Graph(store=target_ds.store, identifier=g.identifier)) - for g in base_named_graphs - } - mixed_graphs.update(mod_named_graphs) - default_context_id = target_ds.default_context.identifier - for i, m in mixed_graphs.items(): - if i == 
default_context_id: - target_ds.store.remove_graph(target_ds.default_context) - target_ds.default_context = m - target_ds.add_graph(m) + if target_ds == "inplace": + target_ds = base_ds + for mg in mixin_graphs: + mod_named_graphs = { + g.identifier: mix_graphs(g, mg, target_graph="inplace") + for g in base_named_graphs + } + elif isinstance(target_ds, str): + raise RuntimeError("target_ds cannot be a string (unless it is 'inplace')") + else: + + mixed_graphs = {} + for mg in mixin_graphs: + mod_named_graphs = { + g.identifier: mix_graphs(g, mg, target_graph=rdflib.Graph(store=target_ds.store, identifier=g.identifier)) + for g in base_named_graphs + } + mixed_graphs.update(mod_named_graphs) + default_context_id = target_ds.default_context.identifier + for i, m in mixed_graphs.items(): + if i == default_context_id: + target_ds.store.remove_graph(target_ds.default_context) + target_ds.default_context = m + target_ds.add_graph(m) return target_ds -def mix_graphs(base_graph: GraphLike, extra_graph: GraphLike, target_graph: Optional[ConjunctiveLike] = None): +def mix_graphs(base_graph: GraphLike, extra_graph: GraphLike, target_graph: Optional[Union[GraphLike, str]] = None): """ Make a clone of base_graph and add in the triples from extra_graph :param base_graph: @@ -97,7 +121,7 @@ def mix_graphs(base_graph: GraphLike, extra_graph: GraphLike, target_graph: Opti :param extra_graph: :type extra_graph: rdflib.Graph :param target_graph: - :type target_graph: rdflib.Graph + :type target_graph: rdflib.Graph|str|NoneType :return: The cloned graph with mixed in triples from extra_graph :rtype: rdflib.Graph """ @@ -105,6 +129,11 @@ def mix_graphs(base_graph: GraphLike, extra_graph: GraphLike, target_graph: Opti return mix_datasets(base_graph, extra_graph, target_ds=target_graph) if target_graph is None: g = clone_graph(base_graph, target_graph=None, identifier=base_graph.identifier) + elif target_graph == "inplace": + # Special case, don't clone the basegraph, just put extra 
straight in + g = base_graph + elif isinstance(target_graph, str): + raise RuntimeError("target_graph cannot be a string (unless it is 'inplace')") else: g = clone_graph(base_graph, target_graph=target_graph) g = clone_graph(extra_graph, target_graph=g) diff --git a/pyshacl/validate.py b/pyshacl/validate.py index 1dc64f9..01450fb 100644 --- a/pyshacl/validate.py +++ b/pyshacl/validate.py @@ -67,6 +67,8 @@ class Validator(object): def _load_default_options(cls, options_dict: dict): options_dict.setdefault('advanced', False) options_dict.setdefault('inference', 'none') + options_dict.setdefault('inplace', False) + options_dict.setdefault('use_js', False) options_dict.setdefault('abort_on_error', False) if 'logger' not in options_dict: options_dict['logger'] = logging.getLogger(__name__) @@ -79,8 +81,11 @@ def _run_pre_inference( Note, this is the OWL/RDFS pre-inference, it is not the Advanced Spec SHACL-Rule inferencing step. :param target_graph: + :type target_graph: rdflib.Graph|rdflib.ConjunctiveGraph|rdflib.Dataset :param inference_option: + :type inference_option: str :return: + :rtype: NoneType """ if logger is None: logger = logging.getLogger(__name__) @@ -167,6 +172,7 @@ def __init__( self.options = options # type: dict self.logger = options['logger'] # type: logging.Logger self.pre_inferenced = kwargs.pop('pre_inferenced', False) + self.inplace = options['inplace'] if not isinstance(data_graph, rdflib.Graph): raise RuntimeError("data_graph must be a rdflib Graph object") self.data_graph = data_graph @@ -180,7 +186,8 @@ def __init__( shacl_graph = clone_graph(data_graph, identifier='shacl') assert isinstance(shacl_graph, rdflib.Graph), "shacl_graph must be a rdflib Graph object" self.shacl_graph = ShapesGraph(shacl_graph, self.logger) - if self.options.get('use_js', None): + + if options['use_js']: is_js_installed = check_extra_installed('js') if is_js_installed: self.shacl_graph.enable_js() @@ -191,23 +198,27 @@ def target_graph(self): def 
mix_in_ontology(self): if not self.data_graph_is_multigraph: - return mix_graphs(self.data_graph, self.ont_graph) - return mix_datasets(self.data_graph, self.ont_graph) + return mix_graphs(self.data_graph, self.ont_graph, "inplace" if self.inplace else None) + return mix_datasets(self.data_graph, self.ont_graph, "inplace" if self.inplace else None) def run(self): - if self.ont_graph is not None: - # creates a copy of self.data_graph, doesn't modify it - the_target_graph = self.mix_in_ontology() + if self.target_graph is not None: + the_target_graph = self.target_graph else: - the_target_graph = self.data_graph - inference_option = self.options.get('inference', 'none') - if inference_option: - if self.pre_inferenced: - the_target_graph = self._target_graph - elif str(inference_option) != "none": + has_cloned = False + if self.ont_graph is not None: + # creates a copy of self.data_graph, doesn't modify it + the_target_graph = self.mix_in_ontology() + has_cloned = True + else: + the_target_graph = self.data_graph + inference_option = self.options.get('inference', 'none') + if inference_option and not self.pre_inferenced and str(inference_option) != "none": + if not has_cloned and not self.inplace: + the_target_graph = clone_graph(the_target_graph) self._run_pre_inference(the_target_graph, inference_option, self.logger) self.pre_inferenced = True - self._target_graph = the_target_graph + self._target_graph = the_target_graph shapes = self.shacl_graph.shapes # This property getter triggers shapes harvest. 
@@ -304,6 +315,7 @@ def validate( ont_graph: Optional[Union[GraphLike, str, bytes]] = None, advanced: Optional[bool] = False, inference: Optional[str] = None, + inplace: Optional[bool] = False, abort_on_error: Optional[bool] = False, **kwargs, ): @@ -321,6 +333,8 @@ def validate( :type advanced: bool | None :param inference: One of "rdfs", "owlrl", "both", "none", or None :type inference: str | None + :param inplace: If this is enabled, do not clone the datagraph, manipulate it inplace + :type inplace: bool :param abort_on_error: :type abort_on_error: bool | None :param kwargs: @@ -366,6 +380,7 @@ def validate( ont_graph=ont_graph, options={ 'inference': inference, + 'inplace': inplace, 'abort_on_error': abort_on_error, 'advanced': advanced, 'use_js': use_js, @@ -576,6 +591,10 @@ def check_dash_result(validator: Validator, report_graph: GraphLike, expected_re gv_res = None if len(inf_test_cases) > 0: data_graph = validator.target_graph + if isinstance(data_graph, (rdflib.ConjunctiveGraph, rdflib.Dataset)): + named_graphs = list(data_graph.contexts()) + else: + named_graphs = [data_graph] inf_res: Union[bool, None] = True for test_case in inf_test_cases: expected_results = expected_result_graph.objects(test_case, DASH_expectedResult) @@ -584,7 +603,10 @@ def check_dash_result(validator: Validator, report_graph: GraphLike, expected_re raise ReportableRuntimeError( "Cannot check the expected result, the given InferencingTestCase does not have an expectedResult." ) - inf_res = inf_res and compare_inferencing_reports(data_graph, expected_result_graph, expected_results) + found = False + for g in named_graphs: + found = found or compare_inferencing_reports(g, expected_result_graph, expected_results) + inf_res = inf_res and found else: inf_res = None if len(fn_test_cases) > 0: diff --git a/test/test_extra.py b/test/test_extra.py index ac84850..185a0c4 100644 --- a/test/test_extra.py +++ b/test/test_extra.py @@ -6,7 +6,7 @@ # are added as required. 
import os import re - +from rdflib import Graph from pyshacl import validate from pyshacl.errors import ReportableRuntimeError @@ -124,12 +124,28 @@ """ def test_validate_with_ontology(): - res = validate(data_file_text, shacl_graph=shacl_file_text, - data_graph_format='turtle', shacl_graph_format='turtle', - ont_graph=ontology_file_text, ont_graph_format="turtle", - inference='both', debug=True) + g = Graph().parse(data=data_file_text, format='turtle') + e = Graph().parse(data=ontology_file_text, format='turtle') + g_len = len(g) + res = validate(g, shacl_graph=shacl_file_text, + shacl_graph_format='turtle', + ont_graph=e, inference='both', debug=True) + conforms, graph, string = res + g_len2 = len(g) + assert conforms + assert g_len2 == g_len + +def test_validate_with_ontology_inplace(): + g = Graph().parse(data=data_file_text, format='turtle') + e = Graph().parse(data=ontology_file_text, format='turtle') + g_len = len(g) + res = validate(g, shacl_graph=shacl_file_text, + shacl_graph_format='turtle', + ont_graph=e, inference='both', debug=True, inplace=True) conforms, graph, string = res + g_len2 = len(g) assert conforms + assert g_len2 != g_len def test_validate_with_ontology_fail1(): res = validate(data_file_text_bad, shacl_graph=shacl_file_text, diff --git a/test/test_js/test_js_constraint.py b/test/test_js/test_js_constraint.py index 42458b6..21db4b1 100644 --- a/test/test_js/test_js_constraint.py +++ b/test/test_js/test_js_constraint.py @@ -1,5 +1,5 @@ from rdflib import Graph -from pyshacl import validate +from pyshacl import validate, extras shapes_graph = '''\ @prefix rdf: . @prefix rdfs: . @@ -31,6 +31,8 @@ ex:germanLabel "Spain"@en . ''' +extras.dev_mode = True + def test_js_constraint(): s1 = Graph().parse(data=shapes_graph, format="turtle") g1 = Graph().parse(data=data_graph, format="turtle")