Skip to content

Commit

Permalink
guess_punct_space: remove whitespace before punct
Browse files Browse the repository at this point in the history
This is similar to webstruct.utils.smart_join
(https://github.com/scrapinghub/webstruct/blob/5a3f39e2ec78a04ca021a12dff58f66686d86251/webstruct/utils.py#L61),
but it is applied only at tag boundaries.
This mode is only slightly slower than the default.
  • Loading branch information
lopuhin committed May 29, 2017
1 parent 43f1bd4 commit e833357
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 18 deletions.
33 changes: 26 additions & 7 deletions html_text/html_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,36 @@ def parse_html(html):
return lxml.html.fromstring(html.encode('utf8'), parser=parser)


_whitespace = re.compile('\s+')
_whitespace = re.compile(r'\s+')
_trailing_whitespace = re.compile(r'\s$')
_punct_after = re.compile(r'[,:;.!?"\)]')
_punct_before = re.compile(r'[\(]')


def selector_to_text(sel):
def selector_to_text(sel, guess_punct_space=False):
""" Convert a cleaned selector to text.
Almost the same as xpath normalize-space, but this also
adds spaces between inline elements (like <span>) which are
often used as block elements in html markup.
"""
fragments = (_whitespace.sub(' ', x.strip())
for x in sel.xpath('//text()').extract())
return ' '.join(x for x in fragments if x)
if guess_punct_space:

def fragments():
prev = None
for text in sel.xpath('//text()').extract():
if prev is not None and (_trailing_whitespace.search(prev)
or (not _punct_after.match(text) and
not _punct_before.match(prev))):
yield ' '
yield text
prev = text

return _whitespace.sub(' ', ''.join(fragments()).strip())

else:
fragments = (_whitespace.sub(' ', x.strip())
for x in sel.xpath('//text()').extract())
return ' '.join(x for x in fragments if x)


def cleaned_selector(html):
Expand All @@ -70,10 +88,11 @@ def cleaned_selector(html):
return sel


def extract_text(html, encoding='utf8'):
def extract_text(html, guess_punct_space=False):
"""
Convert html to text.
html should be a unicode string or an already parsed lxml.html element.
"""
return selector_to_text(cleaned_selector(html))
sel = cleaned_selector(html)
return selector_to_text(sel, guess_punct_space=guess_punct_space)
41 changes: 30 additions & 11 deletions tests/test_html_text.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,49 @@
# -*- coding: utf-8 -*-
import pytest

from html_text import extract_text, parse_html


def test_extract_text():
@pytest.fixture(params=[{'guess_punct_space': True},
{'guess_punct_space': False}])
def all_options(request):
return request.param


def test_extract_text(all_options):
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
assert extract_text(html) == u'Hello, world!'
assert extract_text(html, **all_options) == u'Hello, world!'


def test_declared_encoding():
def test_declared_encoding(all_options):
html = (u'<?xml version="1.0" encoding="utf-8" ?>'
u'<html><style>.div {}</style>'
u'<body>Hello, world!</p></body></html>')
assert extract_text(html) == u'Hello, world!'
assert extract_text(html, **all_options) == u'Hello, world!'


def test_empty():
assert extract_text(u'') == ''
def test_empty(all_options):
assert extract_text(u'', **all_options) == ''


def test_extract_text_from_tree():
def test_extract_text_from_tree(all_options):
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
tree = parse_html(html)
assert extract_text(tree) == u'Hello, world!'
assert extract_text(tree, **all_options) == u'Hello, world!'


def test_inline_tags_whitespace(all_options):
html = u'<span>field</span><span>value of</span><span></span>'
assert extract_text(html, **all_options) == u'field value of'


def test_punct_whitespace():
html = u'<div><span>field</span>, and more</div>'
assert extract_text(html) == u'field , and more'


def test_inline_tags_whitespace():
html = u'<span>field</span><span>value</span>'
assert extract_text(html) == u'field value'
def test_punct_whitespace_preserved():
html = (u'<div><span>по</span><span>ле</span>, and , '
u'<span>more </span>!<span>now</div>(<b>boo</b>)')
assert (extract_text(html, guess_punct_space=True) ==
u'по ле, and , more ! now (boo)')

0 comments on commit e833357

Please sign in to comment.