Merge pull request #55 from Shoobx/alga-tag-uniqueattrs

Allow specifying a pair of (tag, attr) in uniqueattrs.
Shoobx · Oct 8, 2019 · ec579f0 · ec579f0
2 parents 9bfd301 + a8adb28
commit ec579f0
Show file tree

Hide file tree

Showing 6 changed files with 134 additions and 13 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -4,8 +4,9 @@ Changes
 2.4 (unreleased)
 ----------------
 
-- Nothing changed yet.
-
+- Added an option to pass pairs of (element, attr) as unique
+  attributes for tree matching.  Exposed this option on the command
+  line, too.
 
 2.3 (2019-02-27)
 ----------------

diff --git a/README.rst b/README.rst
@@ -82,5 +82,7 @@ Contributors
 
  * Stephan Richter, srichter@shoobx.com
 
+ * Albertas Agejevas, alga@shoobx.com
+
 The diff algorithm is based on "`Change Detection in Hierarchically Structured Information <http://ilpubs.stanford.edu/115/1/1995-46.pdf>`_",
 and the text diff is using Google's ``diff_match_patch`` algorithm.
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -98,7 +98,7 @@ The included formatters, ``diff``, ``xml``, and ``old`` all return a Unicode str
 and no guarantees are done that the output of one version will be the same as the output of any previous version.
 The actions of the edit script can be in a different order or replaced by equivalent actions depending on the version of ``xmldiff``,
 but if the Edit Script does not correctly transform one XML tree into another,
-thas is regarded as a bug.
+that is regarded as a bug.
 This means that the output of the ``xml`` format also may change from version to version.
 There is no "correct" solution to how that output should look,
 as the same change can be represented in several different ways.
@@ -107,11 +107,17 @@ as the same change can be represented in several different ways.
 Unique Attributes
 -----------------
 
-The ``uniqueattrs`` argument is a list of strings specifying attributes that uniquely identify a node in the document.
+The ``uniqueattrs`` argument is a list of strings or ``(tag, attribute)`` tuples
+specifying attributes that uniquely identify a node in the document.
 This is used by the differ when trying to match nodes.
 If one node in the left tree has this attribute,
 the node in the right tree with the same value for that attribute will match,
 regardless of other attributes, child nodes or text content.
+Conversely, if the values of the attribute on the nodes in question are different,
+or if only one of the nodes has this attribute,
+the nodes will not match regardless of their structural similarity.
+In case the attribute is a tuple, the attribute match applies only if both nodes
+have the given tag.
 
 The default is ``['{http://www.w3.org/XML/1998/namespace}id']``,
 which is the ``xml:id`` attribute.

diff --git a/tests/test_diff.py b/tests/test_diff.py
@@ -13,6 +13,20 @@
 from .testing import compare_elements
 
 
+def dedent(string):
+    """Remove the maximum common indent of the lines making up the string."""
+    lines = string.splitlines()
+    indent = min(
+        len(line) - len(line.lstrip())
+        for line in lines
+        if line
+    )
+    return "\n".join(
+        line[indent:] if line else line
+        for line in lines
+    )
+
+
 class APITests(unittest.TestCase):
     left = u"<document><p>Text</p><p>More</p></document>"
     right = u"<document><p>Tokst</p><p>More</p></document>"
@@ -302,6 +316,91 @@ def test_compare_with_xmlid(self):
         self.assertEqual(differ.child_ratio(left, right), 1.0)
         self.assertEqual(differ.node_ratio(left, right), 0)
 
+    def test_compare_with_uniqueattrs(self):
+        # `uniqueattrs` can be pairs of (tag, attribute) as well as just string
+        # attributes.
+        left = dedent(u"""\
+        <document>
+            <story firstPageTemplate="FirstPage">
+                <section name="oldfirst" ref="1" single-ref="1">
+                    <para>First paragraph</para>
+                    <para>This is the second paragraph</para>
+                </section>
+                <section ref="3" single-ref="3" name="tobedeleted">
+                    <para>Det tredje stycket</para>
+                </section>
+                <section name="last" ref="4" single-ref="4">
+                    <para>Last paragraph</para>
+                </section>
+            </story>
+        </document>
+        """)
+
+        right = dedent(u"""\
+        <document>
+            <story firstPageTemplate="FirstPage">
+                <section name="newfirst" ref="1" single-ref="1">
+                    <para>First paragraph</para>
+                </section>
+                <section name="oldfirst" single-ref="2" ref="2">
+                    <para>This is the second</para>
+                    <para>Det tredje stycket</para>
+                </section>
+                <section single-ref="4" ref="4">
+                    <para>Last paragraph</para>
+                </section>
+                <subsection name="oldfirst" ref="1" single-ref="1">
+                    <para>First paragraph</para>
+                    <para>This is the second paragraph</para>
+                </subsection>
+            </story>
+        </document>
+        """)
+
+        differ = Differ(uniqueattrs=[
+            ('section', 'name'),
+            '{http://www.w3.org/XML/1998/namespace}id'
+        ])
+        differ.set_trees(etree.fromstring(left), etree.fromstring(right))
+        differ.match()
+
+        # Make some choice comparisons here.
+
+        left = differ.left.xpath('/document/story/section[1]')[0]
+        right = differ.right.xpath('/document/story/section[1]')[0]
+
+        # These are very similar
+        self.assertEqual(differ.leaf_ratio(left, right), 0.90625)
+        # And one out of two children in common
+        self.assertEqual(differ.child_ratio(left, right), 0.5)
+        # But different names, hence 0 as match
+        self.assertEqual(differ.node_ratio(left, right), 0)
+
+        # Here's the ones with the same tag and name attribute:
+        left = differ.left.xpath('/document/story/section[1]')[0]
+        right = differ.right.xpath('/document/story/section[2]')[0]
+
+        # No children in common
+        self.assertEqual(differ.child_ratio(left, right), 0)
+        # But same tag and name, hence 1 as match
+        self.assertEqual(differ.node_ratio(left, right), 1.0)
+
+        # The last ones are completely similar, but only one
+        # has a name, so they do not match.
+        left = differ.left.xpath('/document/story/section[3]')[0]
+        right = differ.right.xpath('/document/story/section[3]')[0]
+        self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.78260869565)
+        self.assertEqual(differ.child_ratio(left, right), 1.0)
+        self.assertEqual(differ.node_ratio(left, right), 0)
+
+        # Now these are structurally similar, have the same name, but
+        # one of them is not a section, so the uniqueattr does not match
+        left = differ.left.xpath('/document/story/section[1]')[0]
+        right = differ.right.xpath('/document/story/subsection[1]')[0]
+        self.assertAlmostEqual(differ.leaf_ratio(left, right), 1.0)
+        self.assertEqual(differ.child_ratio(left, right), 0.5)
+        self.assertAlmostEqual(differ.node_ratio(left, right), 0.75)
+
     def test_compare_node_rename(self):
         left = u"""<document>
   <para>First paragraph</para>

diff --git a/xmldiff/diff.py b/xmldiff/diff.py
@@ -14,8 +14,9 @@ def __init__(self, F=None, uniqueattrs=None, ratio_mode='fast',
         if F is None:
             F = 0.5
         self.F = F
-        # uniqueattrs is a list of attributes that uniquely identifies a node
-        # inside a document. Defaults to 'xml:id'.
+        # uniqueattrs is a list of attributes or (tag, attribute) pairs
+        # that uniquely identifies a node inside a document. Defaults
+        # to 'xml:id'.
         if uniqueattrs is None:
             uniqueattrs = ['{http://www.w3.org/XML/1998/namespace}id']
         self.uniqueattrs = uniqueattrs
@@ -162,6 +163,12 @@ def node_ratio(self, left, right):
             return 0
 
         for attr in self.uniqueattrs:
+            if not isinstance(attr, str):
+                # If it's actually a sequence of (tag, attr), the tags must
+                # match first.
+                tag, attr = attr
+                if tag != left.tag or tag != right.tag:
+                    continue
             if attr in left.attrib or attr in right.attrib:
                 # One of the nodes have a unique attribute, we check only that.
                 # If only one node has it, it means they are not the same.

diff --git a/xmldiff/main.py b/xmldiff/main.py
@@ -76,7 +76,9 @@ def make_diff_parser():
     parser.add_argument('--unique-attributes', type=str, nargs='?',
                         default='{http://www.w3.org/XML/1998/namespace}id',
                         help='A comma separated list of attributes '
-                             'that uniquely identify a node. Can be empty.')
+                             'that uniquely identify a node. Can be empty. '
+                             'Unique attributes for certain elements can '
+                             'be specified in the format {NS}element@attr.')
     parser.add_argument('--ratio-mode', default='fast',
                         choices={'accurate', 'fast', 'faster'},
                         help='Choose the node comparison optimization.')
@@ -85,6 +87,15 @@ def make_diff_parser():
     return parser
 
 
+def _parse_uniqueattrs(uniqueattrs):
+    if uniqueattrs is None:
+        return []
+    return [
+        attr if '@' not in attr else attr.split('@', 1)
+        for attr in uniqueattrs.split(',')
+    ]
+
+
 def diff_command(args=None):
     parser = make_diff_parser()
     args = parser.parse_args(args=args)
@@ -97,15 +108,10 @@ def diff_command(args=None):
     formatter = FORMATTERS[args.formatter](normalize=normalize,
                                            pretty_print=args.pretty_print)
 
-    if args.unique_attributes is None:
-        uniqueattrs = []
-    else:
-        uniqueattrs = args.unique_attributes.split(',')
-
     diff_options = {'ratio_mode': args.ratio_mode,
                     'F': args.F,
                     'fast_match': args.fast_match,
-                    'uniqueattrs': uniqueattrs,
+                    'uniqueattrs': _parse_uniqueattrs(args.unique_attributes),
                     }
     result = diff_files(args.file1, args.file2, diff_options=diff_options,
                         formatter=formatter)