Skip to content

Commit

Permalink
Merge pull request #55 from Shoobx/alga-tag-uniqueattrs
Browse files Browse the repository at this point in the history
Allow specifying a pair of  (tag, attr) in uniqueattrs.
  • Loading branch information
alga committed Oct 8, 2019
2 parents 9bfd301 + a8adb28 commit ec579f0
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 13 deletions.
5 changes: 3 additions & 2 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ Changes
2.4 (unreleased)
----------------

- Nothing changed yet.

- Added an option to pass pairs of (element, attr) as unique
attributes for tree matching. Exposed this option on the command
line, too.

2.3 (2019-02-27)
----------------
Expand Down
2 changes: 2 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,5 +82,7 @@ Contributors

* Stephan Richter, srichter@shoobx.com

* Albertas Agejevas, alga@shoobx.com

The diff algorithm is based on "`Change Detection in Hierarchically Structured Information <http://ilpubs.stanford.edu/115/1/1995-46.pdf>`_",
and the text diff is using Google's ``diff_match_patch`` algorithm.
10 changes: 8 additions & 2 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ The included formatters, ``diff``, ``xml``, and ``old`` all return a Unicode str
and no guarantees are made that the output of one version will be the same as the output of any previous version.
The actions of the edit script can be in a different order or replaced by equivalent actions depending on the version of ``xmldiff``,
but if the Edit Script does not correctly transform one XML tree into another,
thas is regarded as a bug.
that is regarded as a bug.
This means that the output of the ``xml`` format also may change from version to version.
There is no "correct" solution to how that output should look,
as the same change can be represented in several different ways.
Expand All @@ -107,11 +107,17 @@ as the same change can be represented in several different ways.
Unique Attributes
-----------------

The ``uniqueattrs`` argument is a list of strings specifying attributes that uniquely identify a node in the document.
The ``uniqueattrs`` argument is a list of strings or ``(tag, attribute)`` tuples
specifying attributes that uniquely identify a node in the document.
This is used by the differ when trying to match nodes.
If one node in the left tree has this attribute,
the node in the right tree with the same value for that attribute will match,
regardless of other attributes, child nodes or text content.
Respectively, if the values of the attribute on the nodes in question are different,
or if only one of the nodes has this attribute,
the nodes will not match regardless of their structural similarity.
In case the attribute is a tuple, the attribute match applies only if both nodes
have the given tag.

The default is ``['{http://www.w3.org/XML/1998/namespace}id']``,
which is the ``xml:id`` attribute.
Expand Down
99 changes: 99 additions & 0 deletions tests/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@
from .testing import compare_elements


def dedent(string):
    """Remove the maximum common indent of the lines making up the string.

    The string is split with ``str.splitlines()`` and the lines are joined
    back with a single newline character, so a trailing newline in the
    input is not preserved.

    Every non-empty line takes part in computing the common indent; this
    includes whitespace-only lines.  Empty lines are passed through
    unchanged.  The indent of a line is the number of leading whitespace
    characters removed by ``str.lstrip()``.

    If the string contains no non-empty line at all (for example ``""``),
    nothing is stripped and the (empty) lines are joined as they are.
    """
    lines = string.splitlines()
    indents = [
        len(line) - len(line.lstrip())
        for line in lines
        if line
    ]
    # min() raises ValueError on an empty sequence, which happens when the
    # input is empty or consists of empty lines only; strip nothing then.
    indent = min(indents) if indents else 0
    return "\n".join(
        line[indent:] if line else line
        for line in lines
    )


class APITests(unittest.TestCase):
left = u"<document><p>Text</p><p>More</p></document>"
right = u"<document><p>Tokst</p><p>More</p></document>"
Expand Down Expand Up @@ -302,6 +316,91 @@ def test_compare_with_xmlid(self):
self.assertEqual(differ.child_ratio(left, right), 1.0)
self.assertEqual(differ.node_ratio(left, right), 0)

def test_compare_with_uniqueattrs(self):
# `uniqueattrs` entries can be (tag, attribute) pairs as well as plain
# attribute name strings.
left = dedent(u"""\
<document>
<story firstPageTemplate="FirstPage">
<section name="oldfirst" ref="1" single-ref="1">
<para>First paragraph</para>
<para>This is the second paragraph</para>
</section>
<section ref="3" single-ref="3" name="tobedeleted">
<para>Det tredje stycket</para>
</section>
<section name="last" ref="4" single-ref="4">
<para>Last paragraph</para>
</section>
</story>
</document>
""")

right = dedent(u"""\
<document>
<story firstPageTemplate="FirstPage">
<section name="newfirst" ref="1" single-ref="1">
<para>First paragraph</para>
</section>
<section name="oldfirst" single-ref="2" ref="2">
<para>This is the second</para>
<para>Det tredje stycket</para>
</section>
<section single-ref="4" ref="4">
<para>Last paragraph</para>
</section>
<subsection name="oldfirst" ref="1" single-ref="1">
<para>First paragraph</para>
<para>This is the second paragraph</para>
</subsection>
</story>
</document>
""")

# The `name` attribute is unique only on `section` elements; `xml:id`
# stays unique on every element.
differ = Differ(uniqueattrs=[
('section', 'name'),
'{http://www.w3.org/XML/1998/namespace}id'
])
differ.set_trees(etree.fromstring(left), etree.fromstring(right))
differ.match()

# Make some choice comparisons here.

# First section on each side: name="oldfirst" vs. name="newfirst".
left = differ.left.xpath('/document/story/section[1]')[0]
right = differ.right.xpath('/document/story/section[1]')[0]

# These are very similar
self.assertEqual(differ.leaf_ratio(left, right), 0.90625)
# And one out of two children in common
self.assertEqual(differ.child_ratio(left, right), 0.5)
# But different names, hence 0 as match
self.assertEqual(differ.node_ratio(left, right), 0)

# Here are the ones with the same tag and name attribute:
left = differ.left.xpath('/document/story/section[1]')[0]
right = differ.right.xpath('/document/story/section[2]')[0]

# No children in common
self.assertEqual(differ.child_ratio(left, right), 0)
# But same tag and same name, hence 1 as match
self.assertEqual(differ.node_ratio(left, right), 1.0)

# The last ones are structurally similar, but only one
# has a name, so they do not match.
left = differ.left.xpath('/document/story/section[3]')[0]
right = differ.right.xpath('/document/story/section[3]')[0]
self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.78260869565)
self.assertEqual(differ.child_ratio(left, right), 1.0)
self.assertEqual(differ.node_ratio(left, right), 0)

# Now these are structurally similar, have the same name, but
# one of them is not a section, so the ('section', 'name') uniqueattr
# is not applied and the node ratio is neither forced to 0 nor to 1.
left = differ.left.xpath('/document/story/section[1]')[0]
right = differ.right.xpath('/document/story/subsection[1]')[0]
self.assertAlmostEqual(differ.leaf_ratio(left, right), 1.0)
self.assertEqual(differ.child_ratio(left, right), 0.5)
self.assertAlmostEqual(differ.node_ratio(left, right), 0.75)

def test_compare_node_rename(self):
left = u"""<document>
<para>First paragraph</para>
Expand Down
11 changes: 9 additions & 2 deletions xmldiff/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ def __init__(self, F=None, uniqueattrs=None, ratio_mode='fast',
if F is None:
F = 0.5
self.F = F
# uniqueattrs is a list of attributes that uniquely identifies a node
# inside a document. Defaults to 'xml:id'.
# uniqueattrs is a list of attributes or (tag, attribute) pairs
# that uniquely identifies a node inside a document. Defaults
# to 'xml:id'.
if uniqueattrs is None:
uniqueattrs = ['{http://www.w3.org/XML/1998/namespace}id']
self.uniqueattrs = uniqueattrs
Expand Down Expand Up @@ -162,6 +163,12 @@ def node_ratio(self, left, right):
return 0

for attr in self.uniqueattrs:
if not isinstance(attr, str):
# If it's actually a sequence of (tag, attr), the tags must
# match first.
tag, attr = attr
if tag != left.tag or tag != right.tag:
continue
if attr in left.attrib or attr in right.attrib:
# One of the nodes have a unique attribute, we check only that.
# If only one node has it, it means they are not the same.
Expand Down
20 changes: 13 additions & 7 deletions xmldiff/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ def make_diff_parser():
parser.add_argument('--unique-attributes', type=str, nargs='?',
default='{http://www.w3.org/XML/1998/namespace}id',
help='A comma separated list of attributes '
'that uniquely identify a node. Can be empty.')
'that uniquely identify a node. Can be empty. '
'Unique attributes for certain elements can '
'be specified in the format {NS}element@attr.')
parser.add_argument('--ratio-mode', default='fast',
choices={'accurate', 'fast', 'faster'},
help='Choose the node comparison optimization.')
Expand All @@ -85,6 +87,15 @@ def make_diff_parser():
return parser


def _parse_uniqueattrs(uniqueattrs):
if uniqueattrs is None:
return []
return [
attr if '@' not in attr else attr.split('@', 1)
for attr in uniqueattrs.split(',')
]


def diff_command(args=None):
parser = make_diff_parser()
args = parser.parse_args(args=args)
Expand All @@ -97,15 +108,10 @@ def diff_command(args=None):
formatter = FORMATTERS[args.formatter](normalize=normalize,
pretty_print=args.pretty_print)

if args.unique_attributes is None:
uniqueattrs = []
else:
uniqueattrs = args.unique_attributes.split(',')

diff_options = {'ratio_mode': args.ratio_mode,
'F': args.F,
'fast_match': args.fast_match,
'uniqueattrs': uniqueattrs,
'uniqueattrs': _parse_uniqueattrs(args.unique_attributes),
}
result = diff_files(args.file1, args.file2, diff_options=diff_options,
formatter=formatter)
Expand Down

0 comments on commit ec579f0

Please sign in to comment.