Skip to content

Commit

Permalink
improved utils.download
Browse files Browse the repository at this point in the history
Previously it had trouble with HTML pages where the encoding
is specified in a meta tag in the response body.
  • Loading branch information
kmike committed Feb 1, 2016
1 parent 1d5f147 commit ba9b432
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 1 deletion.
22 changes: 21 additions & 1 deletion formasaurus/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import sys

import requests
from requests.compat import chardet
from w3lib.encoding import html_to_unicode
import tldextract


Expand Down Expand Up @@ -87,4 +89,22 @@ def download(url):
Download a web page from url, return its content as unicode.
"""
url = add_scheme_if_missing(url)
return requests.get(url).text
resp = requests.get(url)
return response2unicode(resp)


def response2unicode(resp):
    """
    Convert requests.Response body to unicode.

    Unlike ``response.text`` it handles <meta> tags in response content.

    :param resp: response object; its ``headers`` mapping and ``content``
        attribute are read.
    :return: the second element of the ``(encoding, text)`` pair returned by
        ``html_to_unicode``; the detected encoding name is discarded.
    """
    _encoding, html = html_to_unicode(
        content_type_header=resp.headers.get('Content-Type'),
        # Must be read from ``resp``: the previous ``p.content`` referenced
        # an undefined name and raised NameError on every call.
        html_body_str=resp.content,
        # Fallback detector handed to html_to_unicode.
        auto_detect_fun=_autodetect_encoding,
    )
    return html


def _autodetect_encoding(binary_data):
    """Return the ``'encoding'`` entry of ``chardet.detect(binary_data)``."""
    detection = chardet.detect(binary_data)
    return detection['encoding']
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def get_version():
"docopt",
"six",
"requests",
"w3lib >= 1.13.0",
],
package_data={
'formasaurus': [
Expand Down

0 comments on commit ba9b432

Please sign in to comment.