/
html_to_text.py
145 lines (102 loc) · 4.22 KB
/
html_to_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
""" This module has classes for converting HTML strings and files to plain text versions.
TODO:
- Is there any way to use io.StringIO instead of a temp file?
- This might help avoid the permission error.
"""
# import modules.
import codecs
import os
import subprocess
from bs4 import BeautifulSoup
class ModifyHTML(BeautifulSoup):
    """ A class with tools to modify the HTML DOM via BeautifulSoup.

    The DOM-altering methods change the instance in place and return the
    instance itself, so calls can be chained.

    Example:
        >>> html = open("sample.html").read() # string
        >>> html = ModifyHTML(html, "html5lib") #BeautifulSoup object
        >>> html.shift_links() # alters DOM
        >>> html.remove_images() # alters DOM
        >>> html.raw() # back to string ...
    """

    def shift_links(self):
        """ Appends A.href value to A tag's text for A tags in BeautifulSoup instance.
        i.e. <a href="bar">foo</a> to <a href="bar">foo [bar]</a>

        Only A tags that have a single string (tag.string is not None), carry an
        "href" attribute, and whose href starts with "http" are altered.

        Returns:
            ModifyHTML: This instance, with the altered DOM.
        """

        for a_tag in self.find_all("a"):
            label = a_tag.string
            href = a_tag.get("href")

            # skip tags without a single text node or without an href.
            if label is None or href is None:
                continue

            # only links whose href starts with "http" are appended.
            if href.startswith("http"):
                label.replace_with(f"{label} [{href}]")

        return self

    def remove_images(self):
        """ Removes image tags from BeautifulSoup instance.

        Returns:
            ModifyHTML: This instance, with every IMG tag extracted from the DOM.
        """

        for img_tag in self.find_all("img"):
            img_tag.extract()

        return self

    def raw(self):
        """ Returns string version of BeautifulSoup instance. """

        return str(self)
class HTMLToText():
    r""" A class to convert HTML files OR strings to plain text via the Lynx browser.

    Examples:
        >>> h2t = HTMLToText()
        >>> h2t.text("sample.html")
        # returns plain text version of "sample.html".
        >>> h2t.text("<p class='hi'>Hello World!</p>", is_raw=True)
        '\nHello World!\n\n'
    """

    def __init__(self, custom_options=None, temp_file="_tmp.html"):
        """ Sets instance attributes.

        Args:
            - custom_options (dict): Custom Lynx options per: http://lynx.browser.org/lynx2.8.8/breakout/lynx_help/Lynx_users_guide.html#InteractiveOptions (Retrieved: April 2017).
              Keys are option names without the leading dash; an option is
              passed to Lynx only if its value is truthy. Non-dict values
              are ignored.
            - temp_file (str): File in which to store raw HTML strings.
        """

        # set default options for Lynx.
        options = {"nolist": True, "nomargins": True, "dump": True}

        # add in custom options; these override the defaults on key clashes.
        if isinstance(custom_options, dict):
            options.update(custom_options)
        self.options = options

        # set persistent temporary file name.
        self.temp_file = temp_file

    def __del__(self):
        """ Tries to remove temporary file if it exists. Passes on OS errors. """

        # __init__ may not have run (or may have failed), so the attribute
        # is not guaranteed to exist when the destructor fires.
        temp_file = getattr(self, "temp_file", None)
        if temp_file is None:
            return

        # best-effort cleanup: permission problems, or the file vanishing
        # between the check and the removal, must not raise from a destructor.
        try:
            if os.path.isfile(temp_file):
                os.remove(temp_file)
        except OSError:
            pass

    def _command(self, target):
        """ Returns the Lynx command as an argument list for @target.

        Args:
            - target (str): The file (or URL) for Lynx to render.

        Returns:
            list: "lynx", one "-<option>" item per truthy entry in
            self.options, then @target.
        """

        flags = ["-" + str(key) for key, val in self.options.items() if val]
        return ["lynx"] + flags + [target]

    def text(self, html, is_raw=False, charset="utf-8"):
        """ Converts HTML files OR strings to plain text via the Lynx browser.

        Args:
            - html (str): The HTML file OR the raw HTML string to convert to text.
            - is_raw (bool): If True, @html is saved to self.temp_file prior to conversion.
            - charset (str): The encoding for the converted text. Also used to
              encode self.temp_file when @is_raw is True.

        Returns:
            str: The decoded standard output of Lynx, or None if Lynx exited
            with a non-zero return code.

        Raises:
            FileNotFoundError: Raised by subprocess.run if the "lynx"
            executable cannot be found.
        """

        # if @is_raw == True, write @html to temporary file.
        if is_raw:
            # newline="" writes @html without newline translation.
            with open(self.temp_file, "w", encoding=charset, newline="") as tmp:
                tmp.write(html)
            target = self.temp_file
        else:
            target = html

        # run Lynx; an argument list (not one string) is required for the
        # default shell=False to work on POSIX and keeps paths with spaces intact.
        cmd = subprocess.run(self._command(target), stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

        # return stdout.
        if cmd.returncode != 0:
            return None
        return cmd.stdout.decode(encoding=charset, errors="backslashreplace")