Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

Deal with smartTag and links with multiple runs #20

Merged
merged 6 commits into from Mar 22, 2013
View
@@ -2,6 +2,11 @@
Changelog
=========
+* 0.1.8
+ * Fixed missing content with hyperlinks with more than one run tag and
+ smartTags.
+ * Certain image types are now being ignored. These include: emf, wmf and
+ svg.
* 0.1.7
* If the indentation level of a set of lists (with the same list id) were
mangled (Starting off with a higher indentation level followed by a
View
@@ -1,4 +1,5 @@
import cgi
+import logging
import os
import os.path
import re
@@ -19,6 +20,9 @@
DETECT_FONT_SIZE = False
EMUS_PER_PIXEL = 9525
+NSMAP = {}
+
+logger = logging.getLogger(__name__)
###
# Help functions
@@ -61,7 +65,9 @@ def wrap(*args, **kwargs):
def get_namespace(el, namespace):
- return '{%s}' % el.nsmap[namespace]
+ if namespace not in NSMAP:
+ NSMAP[namespace] = '{%s}' % el.nsmap[namespace]
+ return NSMAP[namespace]
def convert_image(target, image_size):
@@ -749,6 +755,8 @@ def get_relationship_info(tree, media, image_sizes):
continue
# Store the target in the result dict.
target = el.get('Target')
+ if any(target.lower().endswith(ext) for ext in ['emf', 'wmf', 'svg']):
@winhamwr

winhamwr Mar 22, 2013

Member

The update note should mention that we're now ignoring these images. The list of extensions should also be a declared constant, with a comment explaining why they're ignored.

@winhamwr

winhamwr Mar 22, 2013

Member

Need a test that these are ignored.

+ continue
if target in media:
image_size = image_sizes.get(el_id)
target = convert_image(media[target], image_size)
@@ -848,7 +856,9 @@ def image_handler(image_id, relationship_dict):
image_sizes
)
styles_dict = get_style_dict(styles_xml)
- font_sizes_dict = get_font_sizes_dict(document_xml, styles_dict)
+ font_sizes_dict = defaultdict(int)
+ if DETECT_FONT_SIZE:
@winhamwr

winhamwr Mar 22, 2013

Member

Why did we start using this flag again?

@jlward

jlward Mar 22, 2013

Member

What do you mean by "again"? We have always been using this flag. I added this check as a bit of an optimization, since the font_size stuff is not used anywhere.

@winhamwr

winhamwr Mar 22, 2013

Member

Looking at the diff, it was added with this pull request. Do you know why?

@jlward

jlward Mar 22, 2013

Member

We are using DETECT_FONT_SIZE in other places as well. I only added it here because we are not actually using the font_sizes_dict, so why take on the overhead of populating it?

@winhamwr

winhamwr Mar 22, 2013

Member

Gotcha. Just wanted to make sure there was a reason, since font size stuff wasn't mentioned anywhere in the spec or update note.

+ font_sizes_dict = get_font_sizes_dict(document_xml, styles_dict)
meta_data = MetaData(
numbering_dict=numbering_dict,
relationship_dict=relationship_dict,
@@ -1177,6 +1187,7 @@ def get_p_data(p, meta_data, is_td=False):
'%sr' % w_namespace,
'%shyperlink' % w_namespace,
'%sins' % w_namespace,
+ '%ssmartTag' % w_namespace,
)
elements = []
# Get the tags that are r tags or hyperlink tags
@@ -1193,7 +1204,7 @@ def get_p_data(p, meta_data, is_td=False):
hyperlink_id = None
# Hyperlinks and insert tags need to be handled differently than
# normals runs.
- if el.tag == '%sins' % w_namespace:
+ if el.tag in ('%sins' % w_namespace, '%ssmartTag' % w_namespace):
# Insert tags can have an arbitrary number of r tags in them. Find
# each and insert them into the elements list as the next elements
# in reverse order.
@@ -1210,9 +1221,13 @@ def get_p_data(p, meta_data, is_td=False):
hyperlink_id = el.get('%sid' % r_namespace)
# Once we have the hyperlink_id then we need to replace the
- # hyperlink tag with its child run tag.
- child_run_tag = el.find('%sr' % w_namespace)
- if child_run_tag is None:
+ # hyperlink tags with its child run tag.
@winhamwr

winhamwr Mar 22, 2013

Member

Read this again. We're getting multiple child run tags.

+ text = ''
+ r = None
+ for r in el.xpath('.//w:r', namespaces=el.nsmap):
+ for child in get_raw_data(r):
+ text += handle_t_tag(child, r, None, True, True, meta_data)
+ if r is None:
if has_text(el):
# If there is text in this hyperlink we need to raise an
# exception so that we don't lose content.
@@ -1224,7 +1239,10 @@ def get_p_data(p, meta_data, is_td=False):
# cleaning up old tags, as such this tag has no content and
# should be ignored.
continue
- el = child_run_tag
+ else:
+ t_el = r.find('%st' % w_namespace)
+ t_el.text = text
+ el = r
# t tags hold all the text content.
for child in get_raw_data(el):
@@ -3,16 +3,18 @@
templates = {
'drawing': 'drawing.xml',
'hyperlink': 'hyperlink.xml',
+ 'insert': 'insert.xml',
@winhamwr

winhamwr Mar 22, 2013

Member

What is this?

@jlward

jlward Mar 22, 2013

Member

This would be the ins tag, which was never tested (it acts the same as a smartTag).

@winhamwr

winhamwr Mar 22, 2013

Member

Let's put that in the changelog entry, then.

@jlward

jlward Mar 22, 2013

Member

There is no point; it has been supported for quite a while. I am only now testing it.

@winhamwr

winhamwr Mar 22, 2013

Member

Gotcha. That makes sense, thanks.

'main': 'base.xml',
'p': 'p.xml',
'pict': 'pict.xml',
'r': 'r.xml',
'sectPr': 'sectPr.xml',
+ 'smartTag': 'smart_tag.xml',
+ 'style': 'style.xml',
+ 'styles': 'styles.xml',
'table': 'table.xml',
'tc': 'tc.xml',
'tr': 'tr.xml',
- 'styles': 'styles.xml',
- 'style': 'style.xml',
}
env = Environment(
@@ -64,6 +66,22 @@ def hyperlink_tag(self, r_id, run_tags):
}
return template.render(**kwargs)
+ @classmethod
+ def insert_tag(self, run_tags):
+ template = env.get_template(templates['insert'])
+ kwargs = {
+ 'run_tags': run_tags,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def smart_tag(self, run_tags):
+ template = env.get_template(templates['smartTag'])
+ kwargs = {
+ 'run_tags': run_tags,
+ }
+ return template.render(**kwargs)
+
@classmethod
def li(self, text, ilvl, numId, bold=False):
if isinstance(text, str):
@@ -0,0 +1,5 @@
+<w:ins>
+ {% for run_tag in run_tags %}
+ {{ run_tag }}
+ {% endfor %}
+</w:ins>
@@ -0,0 +1,5 @@
+<w:smartTag>
+ {% for run_tag in run_tags %}
+ {{ run_tag }}
+ {% endfor %}
+</w:smartTag>
@@ -6,10 +6,11 @@
from docx2html.core import (
_is_top_level_upper_roman,
create_html,
- get_style_dict,
get_font_size,
get_image_id,
get_li_nodes,
+ get_relationship_info,
+ get_style_dict,
get_namespace,
is_last_li,
)
@@ -307,6 +308,51 @@ def side_effect(*args, **kwargs):
''')
+class SkipImageTestCase(_TranslationTestCase):
+ relationship_dict = {
+ #'rId0': 'media/image1.svg',
@winhamwr

winhamwr Mar 22, 2013

Member

Why are these commented out?

+ #'rId1': 'media/image2.emf',
+ #'rId2': 'media/image3.wmf',
+ }
+ image_sizes = {
+ 'rId0': (4, 4),
+ 'rId1': (4, 4),
+ 'rId2': (4, 4),
+ }
+ expected_output = '<html></html>'
+
+ @staticmethod
+ def image_handler(image_id, relationship_dict):
+ return relationship_dict.get(image_id)
+
+ def get_xml(self):
+ tags = [
+ DXB.drawing('rId2'),
+ DXB.drawing('rId3'),
+ DXB.drawing('rId4'),
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return etree.fromstring(xml)
+
+ def test_get_relationship_info(self):
+ tree = self.get_xml()
+ media = {
+ 'media/image1.svg': 'test',
+ 'media/image2.emf': 'test',
+ 'media/image3.wmf': 'test',
+ }
+ relationship_info = get_relationship_info(
+ tree,
+ media,
+ self.image_sizes,
+ )
+ self.assertEqual(relationship_info, {})
+
+
class ListWithContinuationTestCase(_TranslationTestCase):
expected_output = '''
<html>
@@ -443,6 +489,25 @@ def get_xml(self):
return etree.fromstring(xml)
+class NonStandardTextTagsTestCase(_TranslationTestCase):
+ expected_output = '''
+ <html>
+ <p>insert smarttag</p>
+ </html>
+ '''
+
+ def get_xml(self):
+ run_tags = [DXB.r_tag(i) for i in 'insert ']
+ insert_tag = DXB.insert_tag(run_tags)
+ run_tags = [DXB.r_tag(i) for i in 'smarttag']
+ smart_tag = DXB.smart_tag(run_tags)
+
+ run_tags = [insert_tag, smart_tag]
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return etree.fromstring(xml)
+
+
class HyperlinkStyledTestCase(_TranslationTestCase):
relationship_dict = {
'rId0': 'www.google.com',
@@ -464,6 +529,26 @@ def get_xml(self):
return etree.fromstring(xml)
+class HyperlinkWithMultipleRunsTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'www.google.com',
+ }
+
+ expected_output = '''
+ <html>
+ <p><a href="www.google.com">link</a>.</p>
+ </html>
+ '''
+
+ def get_xml(self):
+ run_tags = [DXB.r_tag(i) for i in 'link']
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ run_tags.append(DXB.r_tag('.', is_bold=False))
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return etree.fromstring(xml)
+
+
class HyperlinkNoTextTestCase(_TranslationTestCase):
relationship_dict = {
'rId0': 'www.google.com',