From 6135ba63bb36eb4dcea5c448ac583c39732ba0dd Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin <kostia.lopuhin@gmail.com>
Date: Fri, 26 May 2017 17:09:17 +0300
Subject: [PATCH] Add whitespace even for inline tags

Thanks @codinguncut for the suggestion. Still needs testing.
re.sub replicates the whitespace collapsing of XPath's normalize-space().
See GH-1
---
 html_text/html_text.py  | 9 ++++++++-
 tests/test_html_text.py | 5 +++++
 2 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/html_text/html_text.py b/html_text/html_text.py
index 532c3ac..dfce3ca 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import re
+
 import lxml
 import lxml.etree
 from lxml.html.clean import Cleaner
@@ -41,8 +43,13 @@ def parse_html(html):
 
 def selector_to_text(sel):
     """ Convert a cleaned selector to text.
+    Almost the same as XPath normalize-space(), but this also adds
+    spaces between inline elements (like <span>), which are often
+    used as block elements in HTML markup; the query is relative to sel.
     """
-    return sel.xpath('normalize-space()').extract_first('')
+    fragments = (re.sub(r'\s+', ' ', x.strip())
+                 for x in sel.xpath('.//text()').extract())
+    return ' '.join(x for x in fragments if x)
 
 
 def cleaned_selector(html):
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index eedba9a..b5078e3 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -23,3 +23,8 @@ def test_extract_text_from_tree():
     html = u'<html><style>.div {}</style><body><p>Hello,   world!</body></html>'
     tree = parse_html(html)
     assert extract_text(tree) == u'Hello, world!'
+
+
+def test_inline_tags_whitespace():
+    html = u'<span>field</span><span>value</span>'  # no space between tags
+    assert extract_text(html) == u'field value'  # a space must be inserted