From 3b8297494eee3db2a628fef544fb2db0255fc58e Mon Sep 17 00:00:00 2001 From: Lennart Regebro Date: Mon, 25 Jun 2018 18:11:59 +0200 Subject: [PATCH] Once again fix the namespace handling The original fix fixed too much. This fix does the right thing, but changes the return format of the parser by adding a field for the namespace prefix. --- CHANGES.rst | 6 ++- src/xmldiff/fmes.py | 2 +- src/xmldiff/objects.py | 23 +++++++-- src/xmldiff/parser.py | 64 ++++++++++++++++-------- tests/test_parser.py | 111 ++++++++++++++++++++++++----------------- tests/test_regrtest.py | 11 ++-- 6 files changed, 135 insertions(+), 82 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 6c27a8e..266a2f4 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,7 +4,11 @@ CHANGES 1.1.2 (unreleased) ------------------ -- Nothing changed yet. +- When I fixed the xpath namespace handling I also changed the tag names to + an xpath syntax. This was unhelpful, so I changed that back. To solve this + I have had to extend the return format from the parser and add a N_NSPREFIX + that contains the prefix. This is used by the differ to return correct + xpaths without changing the tags. 
1.1.1 (2018-06-20) diff --git a/src/xmldiff/fmes.py b/src/xmldiff/fmes.py index f7965e9..ec21cd7 100644 --- a/src/xmldiff/fmes.py +++ b/src/xmldiff/fmes.py @@ -384,7 +384,7 @@ def _before_attribute(self, parent_node, attr_node, new_name=None): return attr_name FAKE_TAG = [NT_NODE, 'LogilabXMLDIFFFAKETag', 'LogilabXMLDIFFFAKETag', - [], None, 0, 0, True, False] + [], None, 0, 0, None, True, False] def _before_insert_text(self, parent, new_text, k): """ check if a text node that will be remove has two sibbling text diff --git a/src/xmldiff/objects.py b/src/xmldiff/objects.py index 08926dd..35384e2 100644 --- a/src/xmldiff/objects.py +++ b/src/xmldiff/objects.py @@ -41,7 +41,8 @@ N_PARENT = 4 # node's parent N_ISSUE = 5 # node's total issue number N_XNUM = 6 # to compute node's xpath -NSIZE = 7 # number of items in a list which represent a node +N_NSPREFIX = 7 # node's namespace prefix (if any) +NSIZE = 8 # number of items in a list which represent a node # NODE TYPES # NT_SYST = 0 # SYSTEM node (added by parser) /!\ deprecated @@ -120,14 +121,26 @@ def caract(node): def f_xpath(node, x=''): """ compute node's xpath """ - if node[N_NAME] != '/': + name = node[N_NAME] + if '{' in name: + # We have a namespace + pre, rest = name.split('{', 1) + uri, local_name = rest.split('}', 1) + prefix = node[N_NSPREFIX] + if prefix is None: + # Default namespace + name = pre + local_name + else: + name = '%s%s:%s' % (pre, prefix, local_name) + + if name != '/': if node[N_TYPE] == NT_ATTN: return f_xpath(node[N_PARENT], - '/%s' % node[N_NAME][:len(node[N_NAME]) - 4]) + '/%s' % name[:len(name) - 4]) if node[N_TYPE] == NT_ATTV: - return f_xpath(node[N_PARENT]) # [N_PARENT], '/%s'%node[N_NAME]) + return f_xpath(node[N_PARENT]) # [N_PARENT], '/%s'%name) return f_xpath(node[N_PARENT], '/%s[%d]%s' % ( - node[N_NAME], node[N_XNUM], x)) + name, node[N_XNUM], x)) elif not x: return '/' return x diff --git a/src/xmldiff/parser.py b/src/xmldiff/parser.py index 936e631..835fc90 100644 --- 
a/src/xmldiff/parser.py +++ b/src/xmldiff/parser.py @@ -43,7 +43,7 @@ class SaxHandler(ContentHandler): """ def __init__(self, normalize_space, include_comment): - self._p_stack = [[NT_ROOT, '/', '', [], None, 0, 0]] + self._p_stack = [[NT_ROOT, '/', '', [], None, 0, 0, None]] self._norm_sp = normalize_space or None self._incl_comm = include_comment or None self._xpath = '' @@ -70,40 +70,59 @@ def endPrefixMapping(self, prefix): def _buildTag(self, ns_name_tuple): ns_uri, local_name = ns_name_tuple - - if ns_uri and ns_uri != self._default_ns: - ns_name = [x[0] for x in self._ns_mapping.items() - if ns_uri in x[1]][0] - return "%s:%s" % (ns_name, local_name) - + if ns_uri: + el_tag = "{%s}%s" % ns_name_tuple + else: + el_tag = local_name + return el_tag + + def _getPrefix(self, ns_uri): + if not ns_uri: + return None + for (prefix, uri) in self._ns_mapping.items(): + if ns_uri in uri: + return prefix + if ns_uri == 'http://www.w3.org/XML/1998/namespace': + # It's the xml: namespace, undeclared. 
+ return 'xml' + raise ValueError("No prefix found for namespace URI %s" % ns_uri) + + # Don't know if I need this + def _buildXPath(self, ns_name_tuple): + ns_uri, local_name = ns_name_tuple + if ns_uri: + prefix = self._getPrefix(ns_uri) + return '%s:%s' % (prefix, local_name) return local_name ## method of the ContentHandler interface ################################# - def startElementNS(self, name, qname, attributes): - if attributes: - attributes = dict( - [(self._buildTag(k), v) for k, v in attributes.items()]) - self.startElement(self._buildTag(name), attributes) - def startElement(self, name, attrs): + self.startElementNS((None, name), None, attrs) + + def startElementNS(self, name, qname, attrs): + tagName = self._buildTag(name) + prefix = self._getPrefix(name[0]) + # process xpath self._xpath = "%s%s%s" % (self._xpath, '/', name) _inc_xpath(self._h, self._xpath) # nodes construction for element - node = [NT_NODE, name, name, [], None, self._n_elmt + 1, - self._h[self._xpath]] + node = [NT_NODE, tagName, tagName, [], None, self._n_elmt + 1, + self._h[self._xpath], prefix] self._n_elmt += 1 self._xpath = "%s%s%s%s" % ( self._xpath, '[', self._h[self._xpath], ']') # nodes construction for element's attributes # sort attributes to avoid further moves - for key in sorted(attrs.keys()): + for key, value in sorted(attrs.items()): self._n_elmt += 2 - attr_node = [NT_ATTN, '@%sName' % key, key, [], None, 1, 0] + attrName = self._buildTag(key) + prefix = self._getPrefix(key[0]) + attr_node = [NT_ATTN, '@%sName' % attrName, attrName, [], None, + 1, 0, prefix] link_node(node, attr_node) - link_node(attr_node, [NT_ATTV, '@%s' % key, - attrs.get(key, ''), - [], None, 0, 0]) + link_node(attr_node, [NT_ATTV, '@%s' % attrName, value, + [], None, 0, 0, prefix]) link_node(self._p_stack[-1], node) # set current element on the top of the father stack @@ -138,7 +157,8 @@ def characters(self, ch): xpath = '%s/text()' % self._xpath _inc_xpath(self._h, xpath) # nodes 
construction for text - node = [NT_TEXT, 'text()', ch, [], None, 0, self._h[xpath]] + node = [NT_TEXT, 'text()', ch, [], None, 0, + self._h[xpath], None] link_node(parent, node) ## method of the LexicalHandler interface ################################# @@ -153,7 +173,7 @@ def comment(self, content): _inc_xpath(self._h, xpath) # nodes construction for comment node = [NT_COMM, 'comment()', content, [], None, - 0, self._h[xpath]] + 0, self._h[xpath], None] link_node(self._p_stack[-1], node) # methods from xml.sax.saxlib.LexicalHandler (avoid dependency on pyxml) diff --git a/tests/test_parser.py b/tests/test_parser.py index 35cea64..2a28fb7 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -26,7 +26,7 @@ from xmldiff.input import tree_from_stream from xmldiff.input import tree_from_lxml -from xmldiff.objects import N_VALUE, N_CHILDS, N_PARENT +from xmldiff.objects import N_VALUE, N_CHILDS, N_PARENT, N_NSPREFIX HERE = os.path.dirname(__file__) @@ -63,8 +63,8 @@ def test_tree_from_stream_simple(): [[1, u'a', u'a', - [[1, u'b', u'b', [], mock.ANY, 0, 1], - [1, u'c', u'c', [], mock.ANY, 0, 1], + [[1, u'b', u'b', [], mock.ANY, 0, 1, None], + [1, u'c', u'c', [], mock.ANY, 0, 1, None], [1, u'd', u'd', @@ -77,26 +77,32 @@ def test_tree_from_stream_simple(): [], mock.ANY, 0, - 1]], + 1, + None]], mock.ANY, 1, - 1], + 1, + None], [1, u'f', u'f', [], mock.ANY, 0, - 1]], + 1, + None]], mock.ANY, 3, - 1]], + 1, + None]], mock.ANY, 6, - 1]], + 1, + None]], None, 7, - 0] + 0, + None] assert tree == expected @@ -150,74 +156,85 @@ def test_tree_from_stream_with_namespace(): '/', '', [[1, - u'sec:section', - u'sec:section', + u'{urn:corp:sec}section', + u'{urn:corp:sec}section', [[1, - u'sec:sectionInfo', - u'sec:sectionInfo', + u'{urn:corp:sec}sectionInfo', + u'{urn:corp:sec}sectionInfo', [[1, - u'sec:secID', - u'sec:secID', - [[4, 'text()', u'S001', [], None, 0, 1]], + u'{urn:corp:sec}secID', + u'{urn:corp:sec}secID', + [[4, 'text()', u'S001', [], None, 0, 1, None]], 
None, 1, - 1], + 1, + 'sec'], [1, - u'sec:name', - u'sec:name', - [[4, 'text()', u'Sales', [], None, 0, 1]], + u'{urn:corp:sec}name', + u'{urn:corp:sec}name', + [[4, 'text()', u'Sales', [], None, 0, 1, None]], None, 1, - 1]], + 1, + 'sec']], None, 4, - 1], + 1, + 'sec'], [1, - u'sec:sectionInfo', - u'sec:sectionInfo', + u'{urn:corp:sec}sectionInfo', + u'{urn:corp:sec}sectionInfo', [[2, u'@nameName', u'name', - [[3, u'@name', u'Development', [], None, 0, 0]], + [[3, u'@name', u'Development', [], None, 0, 0, None]], None, 1, - 0], + 0, + None], [2, u'@secIDName', u'secID', - [[3, u'@secID', u'S002', [], None, 0, 0]], + [[3, u'@secID', u'S002', [], None, 0, 0, None]], None, 1, - 0]], + 0, + None]], None, 4, - 2], + 2, + 'sec'], [1, - u'sec:sectionInfo', - u'sec:sectionInfo', + u'{urn:corp:sec}sectionInfo', + u'{urn:corp:sec}sectionInfo', [[2, - u'@sec:nameName', - u'sec:name', - [[3, u'@sec:name', u'Gardening', [], None, 0, 0]], + u'@{urn:corp:sec}nameName', + u'{urn:corp:sec}name', + [[3, u'@{urn:corp:sec}name', u'Gardening', [], None, 0, 0, 'sec']], None, 1, - 0], + 0, + 'sec'], [2, - u'@sec:secIDName', - u'sec:secID', - [[3, u'@sec:secID', u'S003', [], None, 0, 0]], + u'@{urn:corp:sec}secIDName', + u'{urn:corp:sec}secID', + [[3, u'@{urn:corp:sec}secID', u'S003', [], None, 0, 0, 'sec']], None, 1, - 0]], + 0, + 'sec']], None, 4, - 3]], + 3, + 'sec']], None, 15, - 1]], + 1, + 'sec']], None, 16, - 0] + 0, + None] assert tree == expected @@ -246,8 +263,8 @@ def test_tree_from_lxml(): # This is only to fix this test, using xmldiff with these versions of # lxml will still work, but the prefixes will be wrong. 
def fix_lxml_421_tree(t, prefix): - t[1] = t[1].replace('ns00:', prefix) - t[2] = t[2].replace('ns00:', prefix) + if t[N_NSPREFIX] == 'ns00': + t[N_NSPREFIX] = prefix for subtree in t[3]: fix_lxml_421_tree(subtree, prefix) @@ -264,7 +281,7 @@ def test_tree_from_lxml_with_namespace(): _nuke_parent(tree_stream) # lxml <= 4.2.1 - fix_lxml_421_tree(tree, 'sec:') + fix_lxml_421_tree(tree, 'sec') assert tree == tree_stream @@ -279,7 +296,7 @@ def test_tree_from_lxml_with_namespace(): _nuke_parent(tree_stream) # lxml <= 4.2.1 - fix_lxml_421_tree(tree, 'z:') + fix_lxml_421_tree(tree, 'z') assert tree == tree_stream @@ -295,7 +312,7 @@ def test_tree_from_lxml_with_default_namespace(): _nuke_parent(tree) _nuke_parent(tree_stream) - fix_lxml_421_tree(tree, '') + fix_lxml_421_tree(tree, None) assert tree == tree_stream diff --git a/tests/test_regrtest.py b/tests/test_regrtest.py index ec6f4e4..19fa74b 100644 --- a/tests/test_regrtest.py +++ b/tests/test_regrtest.py @@ -34,15 +34,15 @@ def get_output(options): backup = sys.stdout # capture stdout - sys.stdout = six.StringIO() + sys.stdout = out = six.StringIO() try: main.run(options) except SystemExit: pass finally: - output = sys.stdout.getvalue().strip() - sys.stdout.close() sys.stdout = backup + output = out.getvalue().strip() + out.close() return output @@ -157,9 +157,8 @@ def test_known(fnames, lcs2_type): old = fnames['old'] new = fnames['new'] res_file = fnames['result'] - f = open(res_file) - expected = f.read().strip() - f.close() + with open(res_file) as f: + expected = f.read().strip() options = [old, new] data = get_output(options) assert data == expected, '%s:\n%r != %r' % (options, data, expected)