From 6135ba63bb36eb4dcea5c448ac583c39732ba0dd Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Fri, 26 May 2017 17:09:17 +0300 Subject: [PATCH] Add whitespace even for inline tags Thanks @codinguncut for suggestion. Still needs testing. re.sub is replicating xpath's normalize-space behaviour. See GH-1 --- html_text/html_text.py | 9 ++++++++- tests/test_html_text.py | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 532c3ac..dfce3ca 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +import re + import lxml import lxml.etree from lxml.html.clean import Cleaner @@ -41,8 +43,13 @@ def parse_html(html): def selector_to_text(sel): """ Convert a cleaned selector to text. + Almost the same as xpath normalize-space, but this also + adds spaces between inline elements (like <span>) which are + often used as block elements in html markup. """ - return sel.xpath('normalize-space()').extract_first('') + fragments = (re.sub('\s+', ' ', x.strip()) + for x in sel.xpath('//text()').extract()) + return ' '.join(x for x in fragments if x) def cleaned_selector(html): diff --git a/tests/test_html_text.py b/tests/test_html_text.py index eedba9a..b5078e3 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -23,3 +23,8 @@ def test_extract_text_from_tree(): html = u'

Hello, world!' tree = parse_html(html) assert extract_text(tree) == u'Hello, world!' + + +def test_inline_tags_whitespace(): + html = u'<span>field</span><span>value</span>' + assert extract_text(html) == u'field value'