From f39cf487b3e9009401158b8400764996d8efb4e9 Mon Sep 17 00:00:00 2001 From: Albertas Agejevas Date: Tue, 1 Oct 2019 18:55:45 +0300 Subject: [PATCH 1/3] Allow specifying a tag and an attr in uniqueattrs. --- CHANGES.rst | 5 ++- README.rst | 2 + tests/test_diff.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++ xmldiff/diff.py | 11 +++++- xmldiff/main.py | 9 ++++- 5 files changed, 120 insertions(+), 6 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index ad5c8c0..d2d52c3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,8 +4,9 @@ Changes 2.4 (unreleased) ---------------- -- Nothing changed yet. - +- Added an option to pass pairs of (element, attr) as unique + attributes for tree matching. Exposed this option on the command + line, too. 2.3 (2019-02-27) ---------------- diff --git a/README.rst b/README.rst index 033d14e..5395a9d 100644 --- a/README.rst +++ b/README.rst @@ -82,5 +82,7 @@ Contributors * Stephan Richter, srichter@shoobx.com + * Albertas Agejevas, alga@shoobx.com + The diff algorithm is based on "`Change Detection in Hierarchically Structured Information `_", and the text diff is using Google's ``diff_match_patch`` algorithm. diff --git a/tests/test_diff.py b/tests/test_diff.py index 1a70b79..13855c7 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -13,6 +13,20 @@ from .testing import compare_elements +def dedent(string): + """Remove the maximum common indent of the lines making up the string.""" + lines = string.splitlines() + indent = min( + len(line) - len(line.lstrip()) + for line in lines + if line + ) + return "\n".join( + line[indent:] if line else line + for line in lines + ) + + class APITests(unittest.TestCase): left = u"

Text

More

" right = u"

Tokst

More

" @@ -302,6 +316,91 @@ def test_compare_with_xmlid(self): self.assertEqual(differ.child_ratio(left, right), 1.0) self.assertEqual(differ.node_ratio(left, right), 0) + def test_compare_with_uniqueattrs(self): + # `uniqueattrs` can be pairs of (tag, attribute) as well as just string + # attributes. + left = dedent(u"""\ + + +
+ First paragraph + This is the second paragraph +
+
+ Det tredje stycket +
+
+ Last paragraph +
+
+
+ """) + + right = dedent(u"""\ + + +
+ First paragraph +
+
+ This is the second + Det tredje stycket +
+
+ Last paragraph +
+ + First paragraph + This is the second paragraph + +
+
+ """) + + differ = Differ(uniqueattrs=[ + ('section', 'name'), + '{http://www.w3.org/XML/1998/namespace}id' + ]) + differ.set_trees(etree.fromstring(left), etree.fromstring(right)) + differ.match() + + # Make some choice comparisons here. + + left = differ.left.xpath('/document/story/section[1]')[0] + right = differ.right.xpath('/document/story/section[1]')[0] + + # These are very similar + self.assertEqual(differ.leaf_ratio(left, right), 0.90625) + # And one out of two children in common + self.assertEqual(differ.child_ratio(left, right), 0.5) + # But different names, hence 0 as match + self.assertEqual(differ.node_ratio(left, right), 0) + + # Here are the ones with the same tag and name attribute: + left = differ.left.xpath('/document/story/section[1]')[0] + right = differ.right.xpath('/document/story/section[2]')[0] + + # No children in common + self.assertEqual(differ.child_ratio(left, right), 0) + # But same names, hence 1 as match + self.assertEqual(differ.node_ratio(left, right), 1.0) + + # The last ones are completely similar, but only one + # has a name, so they do not match. 
+ left = differ.left.xpath('/document/story/section[3]')[0] + right = differ.right.xpath('/document/story/section[3]')[0] + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.78260869565) + self.assertEqual(differ.child_ratio(left, right), 1.0) + self.assertEqual(differ.node_ratio(left, right), 0) + + # Now these are structurally similar, have the same name, but + # one of them is not a section, so the uniqueattr does not match + left = differ.left.xpath('/document/story/section[1]')[0] + right = differ.right.xpath('/document/story/subsection[1]')[0] + self.assertAlmostEqual(differ.leaf_ratio(left, right), 1.0) + self.assertEqual(differ.child_ratio(left, right), 0.5) + self.assertAlmostEqual(differ.node_ratio(left, right), 0.75) + def test_compare_node_rename(self): left = u""" First paragraph diff --git a/xmldiff/diff.py b/xmldiff/diff.py index 95d5eb5..a638dd7 100644 --- a/xmldiff/diff.py +++ b/xmldiff/diff.py @@ -14,8 +14,9 @@ def __init__(self, F=None, uniqueattrs=None, ratio_mode='fast', if F is None: F = 0.5 self.F = F - # uniqueattrs is a list of attributes that uniquely identifies a node - # inside a document. Defaults to 'xml:id'. + # uniqueattrs is a list of attributes or (tag, attribute) pairs + # that uniquely identifies a node inside a document. Defaults + # to 'xml:id'. if uniqueattrs is None: uniqueattrs = ['{http://www.w3.org/XML/1998/namespace}id'] self.uniqueattrs = uniqueattrs @@ -162,6 +163,12 @@ def node_ratio(self, left, right): return 0 for attr in self.uniqueattrs: + if not isinstance(attr, str): + # If it's actually a sequence of (tag, attr), the tags must + # match first. + tag, attr = attr + if tag != left.tag or tag != right.tag: + continue if attr in left.attrib or attr in right.attrib: # One of the nodes have a unique attribute, we check only that. # If only one node has it, it means they are not the same. 
diff --git a/xmldiff/main.py b/xmldiff/main.py index 639532c..44ea4ec 100644 --- a/xmldiff/main.py +++ b/xmldiff/main.py @@ -76,7 +76,9 @@ def make_diff_parser(): parser.add_argument('--unique-attributes', type=str, nargs='?', default='{http://www.w3.org/XML/1998/namespace}id', help='A comma separated list of attributes ' - 'that uniquely identify a node. Can be empty.') + 'that uniquely identify a node. Can be empty. ' + 'Unique attributes for certain elements can ' + 'be specified in the format {NS}element@attr.') parser.add_argument('--ratio-mode', default='fast', choices={'accurate', 'fast', 'faster'}, help='Choose the node comparison optimization.') @@ -100,7 +102,10 @@ def diff_command(args=None): if args.unique_attributes is None: uniqueattrs = [] else: - uniqueattrs = args.unique_attributes.split(',') + uniqueattrs = [ + attr if '@' not in attr else attr.split('@', 1) + for attr in args.unique_attributes.split(',') + ] diff_options = {'ratio_mode': args.ratio_mode, 'F': args.F, From 2bb68c0793f4da3dcae9b81b68cd9b7f572928a4 Mon Sep 17 00:00:00 2001 From: Albertas Agejevas Date: Mon, 7 Oct 2019 19:00:18 +0300 Subject: [PATCH 2/3] Extract _parse_uniqueattrs to please codeclimate. 
--- xmldiff/main.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/xmldiff/main.py b/xmldiff/main.py index 44ea4ec..9ca0f7f 100644 --- a/xmldiff/main.py +++ b/xmldiff/main.py @@ -87,6 +87,15 @@ def make_diff_parser(): return parser +def _parse_uniqueattrs(uniqueattrs): + if uniqueattrs is None: + return [] + return [ + attr if '@' not in attr else attr.split('@', 1) + for attr in uniqueattrs.split(',') + ] + + def diff_command(args=None): parser = make_diff_parser() args = parser.parse_args(args=args) @@ -99,18 +108,10 @@ def diff_command(args=None): formatter = FORMATTERS[args.formatter](normalize=normalize, pretty_print=args.pretty_print) - if args.unique_attributes is None: - uniqueattrs = [] - else: - uniqueattrs = [ - attr if '@' not in attr else attr.split('@', 1) - for attr in args.unique_attributes.split(',') - ] - diff_options = {'ratio_mode': args.ratio_mode, 'F': args.F, 'fast_match': args.fast_match, - 'uniqueattrs': uniqueattrs, + 'uniqueattrs': _parse_uniqueattrs(args.unique_attributes), } result = diff_files(args.file1, args.file2, diff_options=diff_options, formatter=formatter) From a8adb2896e9b6aab8bb691b473842fc94156391e Mon Sep 17 00:00:00 2001 From: Albertas Agejevas Date: Mon, 7 Oct 2019 20:43:23 +0300 Subject: [PATCH 3/3] Document the changes to uniqueattrs in the API. --- docs/source/api.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 6c252ec..4e37149 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -98,7 +98,7 @@ The included formatters, ``diff``, ``xml``, and ``old`` all return a Unicode str and no guarantees are done that the output of one version will be the same as the output of any previous version. 
The actions of the edit script can be in a different order or replaced by equivalent actions dependingon the version of ``xmldiff``, but if the Edit Script does not correctly transform one XML tree into another, -thas is regarded as a bug. +that is regarded as a bug. This means that the output of the ``xml`` format also may change from version to version. There is no "correct" solution to how that output should look, as the same change can be represented in several different ways. @@ -107,11 +107,17 @@ as the same change can be represented in several different ways. Unique Attributes ----------------- -The ``uniqueattrs`` argument is a list of strings specifying attributes that uniquely identify a node in the document. +The ``uniqueattrs`` argument is a list of strings or ``(tag, attribute)`` tuples +specifying attributes that uniquely identify a node in the document. This is used by the differ when trying to match nodes. If one node in the left tree has a this attribute, the node in the right three with the same value for that attribute will match, regardless of other attributes, child nodes or text content. +Conversely, if the values of the attribute on the nodes in question are different, +or if only one of the nodes has this attribute, +the nodes will not match regardless of their structural similarity. +If an entry is a ``(tag, attribute)`` tuple, the attribute match applies only if both nodes +have the given tag. The default is ``['{http://www.w3.org/XML/1998/namespace}id']``, which is the ``xml:id`` attribute.