Fast match algorithm

Less ideal matches, but faster (and not per se *wrong*)
Shoobx · Sep 27, 2018 · e12d6f1 · e12d6f1
1 parent bc09742
commit e12d6f1
Show file tree

Hide file tree

Showing 8 changed files with 274 additions and 34 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -7,6 +7,10 @@ Changes
 - Added options for faster node comparisons. The "middle" option is now
   default, it had very few changes in matches, but is much faster.
 
+- Implemented a Fast Match algorithm for even faster diffing.
+
+- Fixed a bug where MoveNode actions sometimes were in the wrong order.
+
 
 2.0 (2018-09-25)
 ----------------

diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst
@@ -89,15 +89,22 @@ This requires subclassing the formatter:
   >>> XSLT = u'''<?xml version="1.0"?>
   ... <xsl:stylesheet version="1.0"
   ...    xmlns:diff="http://namespaces.shoobx.com/diff"
-  ...    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-  ...    xmlns="http://www.w3.org/1999/xhtml">
+  ...    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
   ...
   ...    <xsl:template match="@diff:insert-formatting">
   ...        <xsl:attribute name="class">
   ...          <xsl:value-of select="'insert-formatting'"/>
   ...        </xsl:attribute>
   ...    </xsl:template>
   ...
+  ...    <xsl:template match="diff:delete">
+  ...        <del><xsl:apply-templates /></del>
+  ...    </xsl:template>
+  ...
+  ...    <xsl:template match="diff:insert">
+  ...        <ins><xsl:apply-templates /></ins>
+  ...    </xsl:template>
+  ...
   ...    <xsl:template match="@* | node()">
   ...      <xsl:copy>
   ...        <xsl:apply-templates select="@* | node()"/>
@@ -111,8 +118,8 @@ This requires subclassing the formatter:
   ...         result = transform(result)
   ...         return super(HTMLFormatter, self).render(result)
 
-The XSLT template above of course only handles one case,
-inserted formatting.
+The XSLT template above of course only handles a few cases,
+like inserted formatting and insert and delete tags (used below).
 A more complete XSLT file is included `here <file:_static/htmlformatter.xslt>`_.
 
 Now use that formatter in the diffing:
@@ -134,3 +141,95 @@ You can then add into your CSS files classes that make inserted text green,
 deleted text red with an overstrike,
 and formatting changes could for example be blue.
 This makes it easy to see what has been changed in a HTML document.
+
+
+Performance Options
+-------------------
+
+The performance options available will not just change the performance,
+but can also change the result.
+It's not always so that the result is worse,
+it's just less accurate.
+In some cases the less accurate result might actually be preferable.
+As an example we take the following HTML codes:
+
+
+.. doctest::
+  :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+  >>> left = u"""<html><body>
+  ...   <p>The First paragraph</p>
+  ...   <p>A Second paragraph</p>
+  ...   <p>Last paragraph</p>
+  ... </body></html>"""
+  >>> right = u"""<html><body>
+  ...   <p>Last paragraph</p>
+  ...   <p>A Second paragraph</p>
+  ...   <p>The First paragraph</p>
+  ... </body></html>"""
+  >>> result = main.diff_texts(left, right)
+  >>> result
+  [MoveNode(node='/html/body/p[1]', target='/html/body[1]', position=2),
+   MoveNode(node='/html/body/p[1]', target='/html/body[1]', position=1)]
+
+Here we see that the differ finds that two paragraphs need to be moved.
+Don't be confused that it says ``p[1]`` in both cases.
+That just means to move the first paragraph,
+and in the second case that first paragraph has already been moved and is now last.
+
+If we format that diff to XML with the XMLFormatter,
+we get output that marks these paragraphs as deleted and then inserted later.
+
+.. doctest::
+  :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+  >>> formatter = HTMLFormatter(
+  ...     normalize=formatting.WS_BOTH)
+  >>> result = main.diff_texts(left, right, formatter=formatter)
+  >>> print(result)
+  <html xmlns:diff="http://namespaces.shoobx.com/diff">
+    <body>
+      <p diff:delete="">The First paragraph</p>
+      <p diff:delete="">A Second paragraph</p>
+      <p>Last paragraph</p>
+      <p diff:insert="">A Second paragraph</p>
+      <p diff:insert="">The First paragraph</p>
+    </body>
+  </html>
+
+Let's try diffing the same HTML with the fast match algorithm:
+
+.. doctest::
+  :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+  >>> result = main.diff_texts(left, right,
+  ...     diff_options={'fast_match': True})
+  >>> result
+  [UpdateTextIn(node='/html/body/p[1]', text='Last paragraph'),
+   UpdateTextIn(node='/html/body/p[3]', text='The First paragraph')]
+
+Now we instead got two update actions.
+This means the resulting HTML is quite different:
+
+.. doctest::
+  :options: -ELLIPSIS, +NORMALIZE_WHITESPACE
+
+  >>> result = main.diff_texts(left, right,
+  ...     diff_options={'fast_match': True},
+  ...     formatter=formatter)
+  >>> print(result)
+  <html xmlns:diff="http://namespaces.shoobx.com/diff">
+    <body>
+      <p><del>The Fir</del><ins>La</ins>st paragraph</p>
+      <p>A Second paragraph</p>
+      <p><del>La</del><ins>The Fir</ins>st paragraph</p>
+    </body>
+  </html>
+
+The texts are updated instead of deleting and then reinserting the whole paragraphs.
+This makes the visual output more readable.
+Also note that the XSLT in this case replaced the ``<diff:insert>`` and ``<diff:delete>`` tags with ``<ins>`` and ``<del>`` tags.
+
+This is a contrived example, though.
+If you are using ``xmldiff`` to generate a visual diff,
+you have to experiment with performance flags to find the best combination of speed and output for your case.
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -68,6 +68,18 @@ Parameters
     Using ``'accurate'`` will be significantly slower,
     especially if your nodes have long texts or many attributes.
 
+``fast_match``:
+  By default ``xmldiff`` will compare each node from one tree with all nodes from the other tree.
+  It will then pick the one node that matches best as the match,
+  if that match passes the match threshold ``F`` (see above).
+
+  If ``fast_match`` is true, ``xmldiff`` will first make a faster run,
+  trying to find chains of matching nodes,
+  during which any match better than ``F`` will count.
+  This significantly cuts down on the time to match nodes,
+  but means that the matches are no longer the best match,
+  only "good enough" matches.
+
 ``formatter``:
   The formatter to use, see `Using Formatters`_.
   If no formatter is specified the function will return a list of edit actions,

diff --git a/tests/test_data/all_actions.expected.xml b/tests/test_data/all_actions.expected.xml
@@ -7,10 +7,10 @@
   </tag>
   <node diff:delete="">
     Here we have some text.
-  </node><new diff:insert=""/><nod diff:insert="">
+  </node><nod diff:insert="">
     Here we have some text.
   </nod>
-  <tail diff:delete="">
+  <new diff:insert=""/><tail diff:delete="">
     My last tag
   </tail>
 </document>
diff --git a/tests/test_diff.py b/tests/test_diff.py
@@ -710,6 +710,104 @@ def test_entirely_different(self):
         ])
 
 
+class FastMatchTests(unittest.TestCase):
+
+    def _match(self, left, right, fast_match):
+        left_tree = etree.fromstring(left)
+        right_tree = etree.fromstring(right)
+        differ = Differ(fast_match=fast_match)
+        differ.set_trees(left_tree, right_tree)
+        matches = differ.match()
+        lpath = differ.left.getroottree().getpath
+        rpath = differ.right.getroottree().getpath
+        return [(lpath(item[0]), rpath(item[1])) for item in matches]
+
+    def test_move_paragraph(self):
+        left = u"""<document>
+    <story firstPageTemplate="FirstPage">
+        <section ref="3" single-ref="3">
+            <para>First paragraph</para>
+            <para>Second paragraph</para>
+        </section>
+        <section ref="4" single-ref="4">
+            <para>Last paragraph</para>
+        </section>
+    </story>
+</document>
+"""
+
+        right = u"""<document>
+    <story firstPageTemplate="FirstPage">
+        <section ref="3" single-ref="3">
+            <para>First paragraph</para>
+        </section>
+        <section ref="4" single-ref="4">
+            <para>Second paragraph</para>
+            <para>Last paragraph</para>
+        </section>
+    </story>
+</document>
+"""
+        # Same matches as the non-fast match test, but the matches are
+        # in a different order.
+        slow_result = sorted(self._match(left, right, False))
+        fast_result = sorted(self._match(left, right, True))
+        self.assertEqual(slow_result, fast_result)
+
+    def test_move_children(self):
+        # Here the paragraphs are all so similar that each paragraph
+        # will match any other.
+        left = u"""<document>
+    <story firstPageTemplate="FirstPage">
+        <section ref="3" single-ref="3">
+            <para>First paragraph</para>
+            <para>Second paragraph</para>
+            <para>Last paragraph</para>
+        </section>
+    </story>
+</document>
+"""
+
+        right = u"""<document>
+    <story firstPageTemplate="FirstPage">
+        <section ref="3" single-ref="3">
+            <para>Second paragraph</para>
+            <para>Last paragraph</para>
+            <para>First paragraph</para>
+        </section>
+    </story>
+</document>
+"""
+        # The slow match will match the nodes that match *best*, so it will
+        # find that paragraphs have moved around.
+        slow_result = sorted(self._match(left, right, False))
+        self.assertEqual(slow_result, [
+            ('/document', '/document'),
+            ('/document/story', '/document/story'),
+            ('/document/story/section', '/document/story/section'),
+            ('/document/story/section/para[1]',
+             '/document/story/section/para[3]'),
+            ('/document/story/section/para[2]',
+             '/document/story/section/para[1]'),
+            ('/document/story/section/para[3]',
+             '/document/story/section/para[2]')
+        ])
+
+        # But the fast match will just pick any that matches.
+        fast_result = sorted(self._match(left, right, True))
+        self.assertEqual(fast_result, [
+            ('/document', '/document'),
+            ('/document/story', '/document/story'),
+            ('/document/story/section', '/document/story/section'),
+            ('/document/story/section/para[1]',
+             '/document/story/section/para[1]'),
+            ('/document/story/section/para[2]',
+             '/document/story/section/para[2]'),
+            ('/document/story/section/para[3]',
+             '/document/story/section/para[3]')
+        ])
+
+
 class UpdateNodeTests(unittest.TestCase):
     """Testing only the update phase of the diffing"""
 

diff --git a/tests/test_formatting.py b/tests/test_formatting.py
@@ -462,7 +462,7 @@ def test_all_actions(self):
             u'[update, /document/node[1]/text()[2], "\\n    '
             u'New tail content\\n  "]\n'
             u'[rename, /document/node[2], nod]\n'
-            u'[insert-after, /document/node[2], \n'
+            u'[insert-after, /document/tag[1], \n'
             u'<new/>]\n'
             u'[remove, /document/tail[1]]'
         )