From c0ff692699927741aafe761f75644ac7661b4607 Mon Sep 17 00:00:00 2001 From: Lennart Regebro Date: Fri, 7 Sep 2018 13:49:48 +0200 Subject: [PATCH] Make a HTML snippet differ instead of RML It's more generically useful --- CHANGES.rst | 3 +- MANIFEST.in | 2 +- docs/source/api.rst | 48 +- docs/source/commandline.rst | 11 +- .../complex-text-update.expected.html | 5 + .../complex-text-update.expected.rml | 5 - ...left.rml => complex-text-update.left.html} | 12 +- ...ght.rml => complex-text-update.right.html} | 14 +- tests/test_data/example.expected.html | 11 + tests/test_data/example.left.html | 14 + tests/test_data/example.right.html | 14 + tests/test_data/insert-node.expected.html | 7 + tests/test_data/insert-node.expected.rml | 7 - tests/test_data/insert-node.left.html | 4 + tests/test_data/insert-node.left.rml | 4 - tests/test_data/insert-node.right.html | 5 + tests/test_data/insert-node.right.rml | 7 - .../no-text-substitutions.expected.rml | 7 - .../test_data/no-text-substitutions.left.rml | 4 - .../test_data/no-text-substitutions.right.rml | 5 - tests/test_data/rmldoc.expected.rml | 300 ---------- tests/test_data/rmldoc.left.rml | 508 ----------------- tests/test_data/rmldoc.right.rml | 519 ------------------ tests/test_formatting.py | 10 +- tests/test_main.py | 16 +- xmldiff/diff.py | 5 + xmldiff/formatting.py | 20 +- xmldiff/main.py | 18 +- 28 files changed, 140 insertions(+), 1445 deletions(-) create mode 100644 tests/test_data/complex-text-update.expected.html delete mode 100644 tests/test_data/complex-text-update.expected.rml rename tests/test_data/{complex-text-update.left.rml => complex-text-update.left.html} (72%) rename tests/test_data/{complex-text-update.right.rml => complex-text-update.right.html} (61%) create mode 100644 tests/test_data/example.expected.html create mode 100644 tests/test_data/example.left.html create mode 100644 tests/test_data/example.right.html create mode 100644 tests/test_data/insert-node.expected.html delete mode 100644 
tests/test_data/insert-node.expected.rml create mode 100644 tests/test_data/insert-node.left.html delete mode 100644 tests/test_data/insert-node.left.rml create mode 100644 tests/test_data/insert-node.right.html delete mode 100644 tests/test_data/insert-node.right.rml delete mode 100644 tests/test_data/no-text-substitutions.expected.rml delete mode 100644 tests/test_data/no-text-substitutions.left.rml delete mode 100644 tests/test_data/no-text-substitutions.right.rml delete mode 100644 tests/test_data/rmldoc.expected.rml delete mode 100644 tests/test_data/rmldoc.left.rml delete mode 100644 tests/test_data/rmldoc.right.rml diff --git a/CHANGES.rst b/CHANGES.rst index d741c34..65762ba 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,7 +4,8 @@ Changes 2.0b3 (unreleased) ------------------ -- Nothing changed yet. +- Replaced the example RMLFormatter with a more generic HTML formatter, + although it only handles HTML snippets at the moment. 2.0b2 (2018-09-06) diff --git a/MANIFEST.in b/MANIFEST.in index 83f9a3f..139ac93 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,7 +5,7 @@ include .coveragerc include Makefile recursive-include tests *.py recursive-include tests *.xml -recursive-include tests *.rml +recursive-include tests *.html recursive-include docs *.bat recursive-include docs *.py recursive-include docs *.rst diff --git a/docs/source/api.rst b/docs/source/api.rst index 8789fac..5e8b165 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -11,13 +11,11 @@ you just import and call one of the three main API methods. :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> from xmldiff import main - >>> main.diff_files("../tests/test_data/insert-node.left.rml", - ... 
"../tests/test_data/insert-node.right.rml") - [UpdateTextIn(node='/document/story[1]', text=None), - InsertNode(target='/document/story[1]', tag='h1', position=0), - UpdateTextIn(node='/document/story/h1[1]', text='Inserted '), - InsertNode(target='/document/story/h1[1]', tag='i', position=0), - UpdateTextIn(node='/document/story/h1/i[1]', text='Node')] + >>> main.diff_files("../tests/test_data/insert-node.left.html", + ... "../tests/test_data/insert-node.right.html") + [UpdateTextIn(node='/body/div[1]', text=None), + InsertNode(target='/body/div[1]', tag='p', position=0), + UpdateTextIn(node='/body/div/p[1]', text='Simple text')] Which one you choose depends on if the XML is contained in files, text strings or ``lxml`` trees. @@ -62,7 +60,7 @@ If no formatter is specified the diff functions will return a list of actions. Such a list is called an edit script and contains all changes needed to transform the "left" XML into the "right" XML. If a formatter is specified that formatter determines the result. -The included formatters, ``diff``, ``xml``, and ``rml`` all return a Unicode string. +The included formatters, ``diff``, ``xml``, and ``html`` all return a Unicode string. Unique Attributes @@ -88,7 +86,7 @@ Using Formatters By default the diff functions will return an edit script, but if you pass in a formatter the result will be whatever that formatter returns. -The three included formatters, ``diff``, ``xml`` and ``rml``, +The three included formatters, ``diff``, ``xml`` and ``html``, all return Unicode strings. The ``diff`` formatter will return a string with the edit script printed out, one action per line. @@ -103,14 +101,12 @@ so the output is not compatible. >>> from xmldiff import formatting >>> formatter = formatting.DiffFormatter() - >>> print(main.diff_files("../tests/test_data/insert-node.left.rml", - ... "../tests/test_data/insert-node.right.rml", + >>> print(main.diff_files("../tests/test_data/insert-node.left.html", + ... 
"../tests/test_data/insert-node.right.html", ... formatter=formatter)) - [update-text, /document/story[1], null] - [insert, /document/story[1], h1, 0] - [update-text, /document/story/h1[1], "Inserted "] - [insert, /document/story/h1[1], i, 0] - [update-text, /document/story/h1/i[1], "Node"] + [update-text, /body/div[1], null] + [insert, /body/div[1], p, 0] + [update-text, /body/div/p[1], "Simple text"] The other two differs return XML with tags describing the changes. @@ -121,17 +117,17 @@ for example with XSLT replacing the tags with the format you need. :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> from xmldiff import formatting - >>> formatter = formatting.RMLFormatter() - >>> print(main.diff_files("../tests/test_data/insert-node.left.rml", - ... "../tests/test_data/insert-node.right.rml", + >>> formatter = formatting.HTMLFormatter() + >>> print(main.diff_files("../tests/test_data/insert-node.left.html", + ... "../tests/test_data/insert-node.right.html", ... formatter=formatter)) - - -

- Inserted Node -

-
-
+ +
+

+ Simple text +

+
+ The Edit Script diff --git a/docs/source/commandline.rst b/docs/source/commandline.rst index 7a4e460..82cff26 100644 --- a/docs/source/commandline.rst +++ b/docs/source/commandline.rst @@ -63,14 +63,11 @@ since the whitespace there occurs inside a tag: In some XML formats, whitespace inside some tags is also not significant. -The ``rml`` formatter is an example of this. -It's a format that can be used to generate documents, -and has a ```` tag for formatted text, -similar to HTML's ``

`` tag. -The ``rml`` formatter is aware of this, +The ``html`` formatter is an example of this. + It is aware that ``

`` tags contain text where whitespace isn't significant, and will by default normalize whitespace inside these tags before comparing it, effectively replacing any whitespace inside those tags to a single space. -This is so that when diffing two versions of RML files you will not see changes that would not be visible in the final document. +This is so that when diffing two versions of HTML files you will not see changes that would not be visible in the final document. Both of these types of whitespace can be preserved with the ``--keep-whitespace`` argument. The third case of whitespace, @@ -87,7 +84,7 @@ The term "pretty printing" refers to making an output a bit more human readable In the case of XML this means inserting ignorable whitespace into the XML, yes, the same in-between whitespace that is ignored by ``xmldiff`` when detecting changes between two files. -``xmldiff``'s ``xml`` and ``rml`` formatters understand the ``--pretty-print`` argument and will insert whitespace to make the output more readable. +``xmldiff``'s ``xml`` and ``html`` formatters understand the ``--pretty-print`` argument and will insert whitespace to make the output more readable. For example, an XML output that would normally look like this: diff --git a/tests/test_data/complex-text-update.expected.html b/tests/test_data/complex-text-update.expected.html new file mode 100644 index 0000000..b4bef51 --- /dev/null +++ b/tests/test_data/complex-text-update.expected.html @@ -0,0 +1,5 @@ + +

+

Let's see. This is some simple text demonstrating the features of the human text differ. This feature attempts to make changelog readable for humans. The human text differ uses sentences as its first order matching. Let's see.It should handle unknown tags just fine.

+
+ diff --git a/tests/test_data/complex-text-update.expected.rml b/tests/test_data/complex-text-update.expected.rml deleted file mode 100644 index d597ae1..0000000 --- a/tests/test_data/complex-text-update.expected.rml +++ /dev/null @@ -1,5 +0,0 @@ - - - Let's see. This is some simple text demonstrating the features of the human text differ. This feature attempts to make changelog readable for humans. The human text differ uses sentences as its first order matching. Let's see. - - diff --git a/tests/test_data/complex-text-update.left.rml b/tests/test_data/complex-text-update.left.html similarity index 72% rename from tests/test_data/complex-text-update.left.rml rename to tests/test_data/complex-text-update.left.html index 7eaadd1..fdc2fef 100644 --- a/tests/test_data/complex-text-update.left.rml +++ b/tests/test_data/complex-text-update.left.html @@ -1,12 +1,12 @@ - - + +
- +

This is some simple text demonstrating the features of the human text differ. This feature attempts to make changelog readable for humans. The human text differ uses sentences as its first order matching. Let's see. - +

- - +
+ diff --git a/tests/test_data/complex-text-update.right.rml b/tests/test_data/complex-text-update.right.html similarity index 61% rename from tests/test_data/complex-text-update.right.rml rename to tests/test_data/complex-text-update.right.html index 60e8133..d72d590 100644 --- a/tests/test_data/complex-text-update.right.rml +++ b/tests/test_data/complex-text-update.right.html @@ -1,12 +1,12 @@ - - + +
- +

Let's see. This is some simple text demonstrating the features of the human text differ. This feature attempts to make changelog readable for humans. The human text differ uses sentences as its - first order matching. - + first order matching. It should handle unknown tags just fine. +

- - +
+ diff --git a/tests/test_data/example.expected.html b/tests/test_data/example.expected.html new file mode 100644 index 0000000..7e527cc --- /dev/null +++ b/tests/test_data/example.expected.html @@ -0,0 +1,11 @@ + + + + <diff:insert>HTML </diff:insert>Example<diff:delete> HTML</diff:delete> + + +

My First Heading

+

My first paragraph has changed.

+ + + diff --git a/tests/test_data/example.left.html b/tests/test_data/example.left.html new file mode 100644 index 0000000..59e3b3a --- /dev/null +++ b/tests/test_data/example.left.html @@ -0,0 +1,14 @@ + + + +Example HTML + + + +

My First Heading

+ +

My first paragraph.

+ + + + diff --git a/tests/test_data/example.right.html b/tests/test_data/example.right.html new file mode 100644 index 0000000..37e6d4a --- /dev/null +++ b/tests/test_data/example.right.html @@ -0,0 +1,14 @@ + + + +HTML Example + + + +

My First Heading

+ +

My paragraph has changed.

+ + + + diff --git a/tests/test_data/insert-node.expected.html b/tests/test_data/insert-node.expected.html new file mode 100644 index 0000000..4cc7ea6 --- /dev/null +++ b/tests/test_data/insert-node.expected.html @@ -0,0 +1,7 @@ + +
+

+ Simple text +

+
+ diff --git a/tests/test_data/insert-node.expected.rml b/tests/test_data/insert-node.expected.rml deleted file mode 100644 index 14f0f6c..0000000 --- a/tests/test_data/insert-node.expected.rml +++ /dev/null @@ -1,7 +0,0 @@ - - -

- Inserted Node -

-
-
diff --git a/tests/test_data/insert-node.left.html b/tests/test_data/insert-node.left.html new file mode 100644 index 0000000..49429e4 --- /dev/null +++ b/tests/test_data/insert-node.left.html @@ -0,0 +1,4 @@ + +
+
+ diff --git a/tests/test_data/insert-node.left.rml b/tests/test_data/insert-node.left.rml deleted file mode 100644 index 399b568..0000000 --- a/tests/test_data/insert-node.left.rml +++ /dev/null @@ -1,4 +0,0 @@ - - - - diff --git a/tests/test_data/insert-node.right.html b/tests/test_data/insert-node.right.html new file mode 100644 index 0000000..8e71f88 --- /dev/null +++ b/tests/test_data/insert-node.right.html @@ -0,0 +1,5 @@ + +
+

Simple text

+
+ diff --git a/tests/test_data/insert-node.right.rml b/tests/test_data/insert-node.right.rml deleted file mode 100644 index 4517c99..0000000 --- a/tests/test_data/insert-node.right.rml +++ /dev/null @@ -1,7 +0,0 @@ - - - -

Inserted Node

- -
-
diff --git a/tests/test_data/no-text-substitutions.expected.rml b/tests/test_data/no-text-substitutions.expected.rml deleted file mode 100644 index aa7eccc..0000000 --- a/tests/test_data/no-text-substitutions.expected.rml +++ /dev/null @@ -1,7 +0,0 @@ - - - - Simple text - - - diff --git a/tests/test_data/no-text-substitutions.left.rml b/tests/test_data/no-text-substitutions.left.rml deleted file mode 100644 index 399b568..0000000 --- a/tests/test_data/no-text-substitutions.left.rml +++ /dev/null @@ -1,4 +0,0 @@ - - - - diff --git a/tests/test_data/no-text-substitutions.right.rml b/tests/test_data/no-text-substitutions.right.rml deleted file mode 100644 index 74920dd..0000000 --- a/tests/test_data/no-text-substitutions.right.rml +++ /dev/null @@ -1,5 +0,0 @@ - - - Simple text - - diff --git a/tests/test_data/rmldoc.expected.rml b/tests/test_data/rmldoc.expected.rml deleted file mode 100644 index 9632cdf..0000000 --- a/tests/test_data/rmldoc.expected.rml +++ /dev/null @@ -1,300 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tests/test_data/rmldoc.left.rml b/tests/test_data/rmldoc.left.rml deleted file mode 100644 index 3e191b5..0000000 --- a/tests/test_data/rmldoc.left.rml +++ /dev/null @@ -1,508 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git 
a/tests/test_data/rmldoc.right.rml b/tests/test_data/rmldoc.right.rml deleted file mode 100644 index 5932065..0000000 --- a/tests/test_data/rmldoc.right.rml +++ /dev/null @@ -1,519 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 06fab4e..04808fa 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -360,16 +360,16 @@ class XMLFormatterFileTests(FormatterFileTests): normalize=formatting.WS_TEXT) -class RMLFormatterFileTests(FormatterFileTests): +class HTMLFormatterFileTests(FormatterFileTests): - # We use the RMLFormatter for the placeholder tests - formatter = formatting.RMLFormatter() + # We use the HTMLFormatter for the placeholder tests + formatter = formatting.HTMLFormatter() # Add tests that use no placeholder replacement (ie plain XML) data_dir = os.path.join(os.path.dirname(__file__), 'test_data') generate_filebased_cases(data_dir, XMLFormatterFileTests) -# Add tests that use placeholder replacement (ie RML) +# Add tests that use placeholder replacement (ie HTML) data_dir = os.path.join(os.path.dirname(__file__), 'test_data') -generate_filebased_cases(data_dir, RMLFormatterFileTests, suffix='rml') +generate_filebased_cases(data_dir, HTMLFormatterFileTests, suffix='html') diff --git a/tests/test_main.py b/tests/test_main.py index 5d3c4ea..e39788f 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -98,19 +98,19 @@ def test_cli_no_args(self): def test_cli_simple(self): curdir = os.path.dirname(__file__) filepath = 
os.path.join(curdir, 'test_data') - file1 = os.path.join(filepath, 'insert-node.left.rml') - file2 = os.path.join(filepath, 'insert-node.right.rml') + file1 = os.path.join(filepath, 'insert-node.left.html') + file2 = os.path.join(filepath, 'insert-node.right.html') output, errors = self.call_run([file1, file2]) - self.assertEqual(len(output.splitlines()), 5) + self.assertEqual(len(output.splitlines()), 3) # This should default to the diff formatter: self.assertEqual(output[0], '[') def test_cli_args(self): curdir = os.path.dirname(__file__) filepath = os.path.join(curdir, 'test_data') - file1 = os.path.join(filepath, 'insert-node.left.rml') - file2 = os.path.join(filepath, 'insert-node.right.rml') + file1 = os.path.join(filepath, 'insert-node.left.html') + file2 = os.path.join(filepath, 'insert-node.right.html') # Select a formatter: output, errors = self.call_run([file1, file2, '--formatter', 'xml']) @@ -122,9 +122,9 @@ def test_cli_args(self): # Don't strip the whitespace keeps the formatting from the source: output, errors = self.call_run([file1, file2, '--keep-whitespace', '--formatter', 'xml']) - self.assertEqual(len(output.splitlines()), 7) + self.assertEqual(len(output.splitlines()), 5) # And stripping and pretty printing gives a longer readable output output, errors = self.call_run([file1, file2, '--pretty-print', - '--formatter', 'xml']) - self.assertEqual(len(output.splitlines()), 11) + '--formatter', 'html']) + self.assertEqual(len(output.splitlines()), 8) diff --git a/xmldiff/diff.py b/xmldiff/diff.py index 2dd61ed..a3a1eeb 100644 --- a/xmldiff/diff.py +++ b/xmldiff/diff.py @@ -48,6 +48,11 @@ def set_trees(self, left, right): self.clear() # Make sure we were passed two lxml elements: + if isinstance(left, etree._ElementTree): + left = left.getroot() + if isinstance(right, etree._ElementTree): + right = right.getroot() + if not (etree.iselement(left) and etree.iselement(right)): raise TypeError("The 'left' and 'right' parameters must be " "lxml 
Elements.") diff --git a/xmldiff/formatting.py b/xmldiff/formatting.py index 497c903..3bc115e 100644 --- a/xmldiff/formatting.py +++ b/xmldiff/formatting.py @@ -41,7 +41,7 @@ def __init__(self, normalize=WS_TAGS, pretty_print=False): normalized with the included formatters. pretty_print is used to choose between a compact and a pretty output. - This is currently only used by the XML and RML formatters. + This is currently only used by the XML and HTML formatters. Formatters may of course have more options than these, but these two are the ones that can be set from the command line. @@ -312,6 +312,11 @@ def format(self, diff, orig_tree): # and also because we don't want to modify the original tree. result = deepcopy(orig_tree) + if isinstance(result, etree._ElementTree): + root = result.getroot() + else: + root = result + etree.register_namespace(DIFF_PREFIX, DIFF_NS) deferred = [] @@ -320,12 +325,12 @@ def format(self, diff, orig_tree): # We need to do text updates last deferred.append(action) continue - self.handle_action(action, result) + self.handle_action(action, root) for action in reversed(deferred): - self.handle_action(action, result) + self.handle_action(action, root) - self.finalize(result) + self.finalize(root) etree.cleanup_namespaces(result, top_nsmap={DIFF_PREFIX: DIFF_NS}) return etree.tounicode(result, pretty_print=self.pretty_print) @@ -555,13 +560,14 @@ def _handle_UpdateTextAfter(self, action, tree): return node -class RMLFormatter(XMLFormatter): +class HTMLFormatter(XMLFormatter): + """A formatter that understands HTML snippets""" def __init__(self, normalize=WS_BOTH, pretty_print=True, - text_tags=('para', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'), + text_tags=('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'), formatting_tags=('b', 'u', 'i', 'strike', 'em', 'super', 'sup', 'sub', 'link', 'a', 'span')): - super(RMLFormatter, self).__init__( + super(HTMLFormatter, self).__init__( normalize=normalize, pretty_print=pretty_print, text_tags=text_tags, 
formatting_tags=formatting_tags) diff --git a/xmldiff/main.py b/xmldiff/main.py index 758670a..5540e3b 100644 --- a/xmldiff/main.py +++ b/xmldiff/main.py @@ -7,13 +7,15 @@ __version__ = pkg_resources.require("xmldiff")[0].version +FORMATTERS = { + 'diff': formatting.DiffFormatter, + 'xml': formatting.XMLFormatter, + 'html': formatting.HTMLFormatter, +} + def diff_trees(left, right, F=0.5, uniqueattrs=None, formatter=None): """Takes two lxml root elements or element trees""" - if isinstance(left, etree._ElementTree): - left = left.getroot() - if isinstance(right, etree._ElementTree): - right = right.getroot() if formatter is not None: formatter.prepare(left, right) differ = diff.Differ(F=F, uniqueattrs=uniqueattrs) @@ -52,7 +54,7 @@ def make_parser(): parser.add_argument('file2', type=FileType('r'), help='the second input file') parser.add_argument('-f', '--formatter', default='diff', - choices=['diff', 'xml', 'rml'], + choices=list(FORMATTERS.keys()), help='formatter selection') parser.add_argument('-w', '--keep-whitespace', action='store_true', help="do not strip ignorable whitespace") @@ -73,12 +75,6 @@ def run(args=None): else: normalize = formatting.WS_BOTH - FORMATTERS = { - 'diff': formatting.DiffFormatter, - 'xml': formatting.XMLFormatter, - 'rml': formatting.RMLFormatter, - } - formatter = FORMATTERS[args.formatter](normalize=normalize, pretty_print=args.pretty_print) result = diff_files(args.file1, args.file2, formatter=formatter)