-
Notifications
You must be signed in to change notification settings - Fork 0
/
markdown_maker.py
executable file
·438 lines (401 loc) · 18.3 KB
/
markdown_maker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
"""
desc:
将html内容转为markdown内容, 图片上传到七牛
auth:
Alan
requirements:
pip install qiniu requests beautifulsoup4
how to use:
查看README.md
"""
import re
import warnings
import random
from qiniu import Auth, put_file, etag, urlsafe_base64_encode
import os
import hashlib
import pathlib
import requests
import shutil
from mimetypes import guess_extension
from bs4 import BeautifulSoup
import configparser
# Markdown wrappers: tag name -> (prefix, suffix) placed around the converted text.
MARKDOWN_ELEMENTS = {
    # ATX headings h1..h6: one '#' per heading level.
    **{
        'h{}'.format(level): ('\n' + '#' * level + ' ', '\n')
        for level in range(1, 7)
    },
    'code': ('`', '`'),
    # Lists: the containers add nothing, each item gets a dash.
    'ul': ('', ''),
    'ol': ('', ''),
    'li': ('- ', ''),
    'blockquote': ('\n> ', '\n'),
    'em': ('*', '*'),
    'strong': ('**', '**'),
    'block_code': ('\n```\n', '\n```\n'),
    'span': ('', ''),
    # Paragraphs, with and without attributes, block-level and inline.
    'p': ('\n', '\n'),
    'p_with_out_class': ('\n', '\n'),
    'inline_p': ('', ''),
    'inline_p_with_out_class': ('', ''),
    'b': ('**', '**'),
    'i': ('*', '*'),
    'del': ('~~', '~~'),
    'hr': ('\n---', '\n\n'),
    # Table parts.
    'thead': ('\n', '|------\n'),
    'tbody': ('\n', '\n'),
    'td': ('|', ''),
    'th': ('|', ''),
    'tr': ('', '\n'),
    'table': ('', '\n'),
    'e_p': ('', '\n'),
}
# Block-level ("outline") tags: tag name -> regex that locates the whole element.
# The patterns are raw strings so that regex escapes such as ``\s`` reach the
# ``re`` module untouched instead of being treated as (invalid) string escapes.
OUTLINE_ELEMENTS = {
    'h1': r'<h1.*?>(.*?)</h1>',
    'h2': r'<h2.*?>(.*?)</h2>',
    'h3': r'<h3.*?>(.*?)</h3>',
    'h4': r'<h4.*?>(.*?)</h4>',
    'h5': r'<h5.*?>(.*?)</h5>',
    'h6': r'<h6.*?>(.*?)</h6>',
    'hr': r'<hr/>',
    'blockquote': r'<blockquote.*?>(.*?)</blockquote>',
    'ul': r'<ul.*?>(.*?)</ul>',
    'ol': r'<ol.*?>(.*?)</ol>',
    # Alternative pattern that requires a nested <code> element (kept for reference):
    # 'block_code': r'<pre.*?><code.*?>(.*?)</code></pre>',
    # Group 1 is the attribute text of the <pre> tag, group 2 is the code itself.
    'block_code': r'<pre(.*?)>(.*?)</pre>',
    # <p> with attributes; the mandatory whitespace keeps this from matching <pre>.
    'p': r'<p\s.*?>(.*?)</p>',
    # Bare <p> without any attribute.
    'p_with_out_class': r'<p>(.*?)</p>',
    'thead': r'<thead.*?>(.*?)</thead>',
    'tr': r'<tr.*?>(.*?)</tr>',
}
# Inline tags: tag name -> regex used to rewrite the tag inside an element's content.
# Raw strings keep regex escapes intact. The ``\b`` after each tag name stops a
# short name from matching a longer tag that merely starts with the same letters
# (``<i`` vs ``<img``, ``<b`` vs ``<br/>``, ``<a`` vs ``<abbr>``, ``<th`` vs ``<thead>``).
INLINE_ELEMENTS = {
    'td': r'<td\b.*?>((.|\n)*?)</td>',  # td element may span lines
    'tr': r'<tr\b.*?>((.|\n)*?)</tr>',
    'th': r'<th\b.*?>(.*?)</th>',
    'b': r'<b\b.*?>(.*?)</b>',
    'i': r'<i\b.*?>(.*?)</i>',
    'del': r'<del\b.*?>(.*?)</del>',
    'inline_p': r'<p\s.*?>(.*?)</p>',
    'inline_p_with_out_class': r'<p>(.*?)</p>',
    'code': r'<code\b.*?>(.*?)</code>',
    'span': r'<span\b.*?>(.*?)</span>',
    'ul': r'<ul\b.*?>(.*?)</ul>',
    'ol': r'<ol\b.*?>(.*?)</ol>',
    'li': r'<li\b.*?>(.*?)</li>',
    # Images: group 1 is always the src; only the paired form has a group 2 (inner text).
    'img': r'<img\b.*?src="(.*?)".*?>(.*?)</img>',
    'img_single': r'<img\b.*?src="(.*?)".*?/>',
    'img_single_no_close': r'<img\b.*?src="(.*?)".*?>',
    # Links: group 1 is the href, group 2 the link text.
    'a': r'<a\b.*?href="(.*?)".*?>(.*?)</a>',
    'em': r'<em\b.*?>(.*?)</em>',
    # Groups 1 and 3 swallow surrounding whitespace; group 2 is the emphasised text.
    'strong': r'<strong\b.*?>(\s*)(.*?)(\s*)</strong>',
    'tbody': r'<tbody\b.*?>((.|\n)*)</tbody>',
}
# Regexes for leftover tags that are stripped from the final Markdown.
# Only the tags themselves are removed; the text between them is kept.
DELETE_ELEMENTS = (
    ['<span.*?>', '</span>']        # span open / close
    + ['<div.*?>', '</div>']        # div open / close
    + ['<br clear="none"/>']        # this specific line-break form only
    + ['<center.*?>', '</center>']  # center open / close
)
# Working directories that sit next to this script: 'tmp_files' and 'md_files'.
tmp_files_path, md_files_path = [
    os.path.join(os.path.dirname(os.path.abspath(__file__)), dir_name)
    for dir_name in ('tmp_files', 'md_files')
]
# Build HTTP request headers with a randomly picked User-Agent.
def rand_header():
    """Return a dict of request headers.

    ``Connection``, ``Accept`` and ``Accept-Language`` are always the same
    fixed entries of their candidate lists; only ``User-Agent`` is drawn at
    random from the list below.
    """
    connection_options = ['Keep-Alive', 'close']
    accept_options = ['text/html, application/xhtml+xml, */*']
    language_options = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    ]
    picked = random.randrange(len(user_agents))
    return {
        'Connection': connection_options[0],
        'Accept': accept_options[0],
        'Accept-Language': language_options[1],
        'User-Agent': user_agents[picked],
    }
# Qiniu cloud-storage helper.
class QiNiu(object):
    """Upload local files, or images fetched from a URL, to a Qiniu bucket.

    Args:
        access_key: Qiniu access key.
        secret_key: Qiniu secret key.
        domain: URL prefix that is concatenated directly with the stored key
            to build the public URL of an uploaded file.
        bucket_name: name of the target bucket.
    """

    def __init__(self, access_key, secret_key, domain, bucket_name):
        self.access_key = access_key
        self.secret_key = secret_key
        self.auth = self.get_auth()
        self.token = None
        self.domain = domain
        self.bucket_name = bucket_name

    def get_auth(self):
        """Return a ``qiniu.Auth`` built from the stored key pair."""
        return Auth(self.access_key, self.secret_key)

    def get_token(self, key_name):
        """Request an upload token for ``key_name`` (valid 3600 s) and store it in ``self.token``."""
        self.token = self.auth.upload_token(self.bucket_name, key_name, 3600)

    def upload_file(self, local_file, key_name=None):
        """Upload ``local_file`` and return the ``(ret, info)`` pair from ``put_file``.

        When ``key_name`` is not given, the key is the file's MD5 hex digest
        followed by the original file extension.
        """
        if not key_name:
            key_name = self.hash_file(local_file) + pathlib.Path(local_file).suffix
        # The token is requested for one specific key, so it must be renewed
        # for every upload rather than reused from a previous (different) key.
        self.get_token(key_name)
        return put_file(self.token, key_name, local_file)

    def hash_file(self, local_file):
        """Return the MD5 hex digest of ``local_file``, reading it in chunks."""
        hasher = hashlib.md5()
        with open(local_file, 'rb') as f:
            for chunk in iter(lambda: f.read(65536), b''):
                hasher.update(chunk)
        return hasher.hexdigest()

    @staticmethod
    def _guess_suffix(content_type):
        """Map a Content-Type header value to a file extension ('' when unknown)."""
        if not content_type:
            return ''
        # Drop parameters such as "; charset=..." which guess_extension rejects.
        mime_type = content_type.split(';', 1)[0].strip()
        return guess_extension(mime_type) or ''

    def upload_url_file(self, pic_url, save_path):
        """Download ``pic_url`` into ``save_path`` and upload it to the bucket.

        Returns:
            The public URL (``domain`` + key) on success, ``None`` when the
            download or the upload answers with a non-200 status, and the
            unchanged ``pic_url`` when any step raises.
        """
        try:
            r = requests.get(pic_url, stream=True, timeout=30)
            try:
                if r.status_code != 200:
                    return None
                suffix = self._guess_suffix(r.headers.get('content-type'))
                os.makedirs(save_path, exist_ok=True)
                save_file = os.path.join(save_path, 'tmp' + suffix)
                with open(save_file, 'wb') as f:
                    # Let urllib3 undo gzip/deflate while the raw stream is copied.
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
            finally:
                r.close()
            ret, info = self.upload_file(save_file)
            if info.status_code == 200:
                return self.domain + ret['key']
            return None
        except Exception:
            # Best effort: on any failure keep pointing at the original image.
            return pic_url
# One matched block-level HTML element and its Markdown rendering.
class Element(object):
    """Hold one matched HTML element and convert its content to Markdown.

    The inline conversion runs in the constructor; ``str(element)`` then wraps
    the converted content in the Markdown prefix/suffix registered for ``tag``
    in ``MARKDOWN_ELEMENTS``.

    Args:
        start_pos: offset where the element starts in the source HTML.
        end_pos: offset where the element ends in the source HTML.
        content: inner text of the element (stripped on assignment).
        tag: key of the element in the tag tables (e.g. ``'p'``, ``'block_code'``).
        class_: attribute text of a ``<pre>`` tag, used to detect the code language.
        qu_niu: optional uploader exposing ``upload_url_file(pic_url=, save_path=)``.
            The parameter name is kept as is for backward compatibility; the
            value is stored as ``self.qi_niu``.
    """

    def __init__(self, start_pos, end_pos, content, tag, class_=None, qu_niu=None):
        self.start_pos = start_pos  # where the element starts
        self.end_pos = end_pos  # where the element ends
        self.content = content.strip()  # inner content
        self.tag = tag  # tag name
        self.class_ = class_
        self.qi_niu = qu_niu
        self.parse_inline()  # convert nested inline tags right away

    def __str__(self):
        """Return the Markdown text of the element (also cached in ``self._result``)."""
        wrapper = MARKDOWN_ELEMENTS.get(self.tag)
        opening = wrapper[0]
        if self.tag == 'block_code' and self.class_:
            # A class such as "brush:python;toolbar:false" names the language
            # for the fenced code block; without it the plain fence is used.
            brush = re.search(r'class="brush:(.*?);toolbar:false"', self.class_)
            if brush:
                opening = '\n```{}\n'.format(brush.group(1))
        self._result = '{}{}{}'.format(opening, self.content, wrapper[1])
        return self._result

    def parse_inline(self):
        """Rewrite entities and inline tags inside ``self.content`` as Markdown."""
        # Character clean-up and HTML entity decoding.
        self.content = self.content.replace('\r', '')  # windows \r character
        self.content = self.content.replace('\xc2\xa0', ' ')  # NBSP mis-decoded from its UTF-8 bytes
        self.content = self.content.replace('\xa0', ' ')  # no break space
        self.content = self.content.replace('&quot;', '"')  # html quote mark
        self.content = self.content.replace('&nbsp;', '')  # non-breaking space entity is dropped
        self.content = self.content.replace('&#39;', '\'')  # apostrophe
        self.content = self.content.replace('&lt;', '<')
        self.content = self.content.replace('&gt;', '>')
        if self.tag == "table":  # for removing tbody
            self.content = re.sub(INLINE_ELEMENTS['tbody'], r'\g<1>', self.content)

        # Tags are processed in alphabetical order of their table key.
        for tag in sorted(INLINE_ELEMENTS):
            pattern = INLINE_ELEMENTS[tag]
            if tag == 'a':
                # No ``count`` here: every link is converted, not just the first few.
                self.content = re.sub(pattern, r'[\g<2>](\g<1>)', self.content, flags=re.S)
            elif tag == 'img':
                self._convert_images(pattern, has_alt=True)
            elif tag in ('img_single', 'img_single_no_close'):
                self._convert_images(pattern, has_alt=False)
            elif self.tag == 'ul' and tag == 'li':
                self.content = re.sub(pattern, r'- \g<1>', self.content)
            elif self.tag == 'ol' and tag == 'li':
                self.content = re.sub(pattern, r'1. \g<1>', self.content)
            elif self.tag == 'thead' and tag == 'tr':
                self.content = re.sub(pattern, r'\g<1>' + '\n', self.content.replace('\n', ''))
            elif self.tag == 'tr' and tag == 'th':
                self.content = re.sub(pattern, r'|\g<1>', self.content.replace('\n', ''))
            elif self.tag == 'tr' and tag == 'td':
                self.content = re.sub(pattern, r'|\g<1>|', self.content.replace('\n', ''))
                self.content = self.content.replace("||", "|")  # end of column also needs a pipe
            elif self.tag == 'table' and tag == 'td':
                self.content = re.sub(pattern, r'|\g<1>|', self.content)
                self.content = self.content.replace("||", "|")  # end of column also needs a pipe
                self.content = self.content.replace('|\n\n', '|\n')  # replace double new line
                self.construct_table()
            else:
                wrapper = MARKDOWN_ELEMENTS.get(tag)
                # The 'strong' pattern captures surrounding whitespace in groups
                # 1 and 3, so its text is group 2; every other pattern uses group 1.
                group = 2 if tag == 'strong' else 1
                replacement = '{}\\g<{}>{}'.format(wrapper[0], group, wrapper[1])
                self.content = re.sub(pattern, replacement, self.content)

    def _convert_images(self, pattern, has_alt):
        """Replace every image matched by ``pattern`` with Markdown image syntax.

        Each image is uploaded on its own, so several images inside one
        element keep distinct URLs. ``has_alt`` tells whether the pattern has
        a second group holding the text between ``<img>`` and ``</img>``.
        """
        def to_markdown(match):
            src = match.group(1)
            alt = match.group(2) if has_alt else ''
            return '![{}]({})'.format(alt, self._upload_image(src) or src)

        self.content = re.sub(pattern, to_markdown, self.content, flags=re.S)

    def _upload_image(self, src):
        """Upload ``src`` through the configured uploader; return its URL or ``None``."""
        if self.qi_niu is None:
            return None
        try:
            return self.qi_niu.upload_url_file(pic_url=src, save_path=tmp_files_path)
        except Exception:
            # Best effort: the caller falls back to the original image URL.
            return None

    def construct_table(self):
        """Prepend Markdown separator rows to table content that already uses pipes."""
        # this function, after self.content has gained | for table entries,
        # adds the |---| in markdown to create a proper table
        count = 1
        for row in self.content.split('\n', 3):
            if row != "":
                count = row.count("|")  # count number of pipes
                break
        pipe = "\n|" + "---|" * (count - 1) + "\n"  # beginning \n for safety
        self.content = pipe + pipe + self.content + "\n"  # TODO: column titles?
        self.content = self.content.replace('|\n\n', '|\n')  # replace double new line
        self.content = self.content.replace("<br/>\n", "<br/>")  # keep a line break inside a cell on one row
class MarkdownMaker(object):
    """Convert an HTML document to Markdown and optionally write it to disk.

    Args:
        html: the HTML text to convert.
        folder: default output folder used by ``export``.
        file: output file name; a ``.html`` part is renamed to ``.md``.
        qi_niu: optional image uploader handed to every ``Element``.
    """

    def __init__(self, html='', folder='', file='', qi_niu=None):
        self.html = html  # actual data
        self.folder = folder
        self.file = file
        self.qi_niu = qi_niu

    def convert(self, html=''):
        """Convert ``html`` (default: ``self.html``) and return the Markdown text.

        The result is also stored in ``self._markdown``.
        """
        if html == '':
            html = self.html
        # First pass: collect the outermost matches as plain tuples
        # (start, end, tag, content, class_). Elements are built only
        # afterwards, so matches that get discarded never trigger the inline
        # conversion (and its image uploads).
        spans = []
        for tag, pattern in OUTLINE_ELEMENTS.items():
            # re.I ignores case, re.M is multi-line mode, re.S lets '.' match newlines too.
            for m in re.finditer(pattern, html, re.I | re.S | re.M):
                start, end = m.start(), m.end()
                groups = m.groups()
                if tag == 'block_code' and len(groups) > 1:
                    # Group 0 holds the <pre> attributes, group 1 the code text.
                    class_, content = groups[0], groups[1]
                else:
                    class_, content = None, ''.join(groups)
                # A match lying strictly inside a kept span is already covered by it.
                enclosed = any(s[0] < start and s[1] > end for s in spans)
                # Kept spans lying strictly inside the new match are superseded by it.
                # The list is rebuilt instead of being mutated while it is iterated.
                spans = [s for s in spans if not (s[0] > start and s[1] < end)]
                if not enclosed:
                    spans.append((start, end, tag, content, class_))
        # Second pass: order by position, convert and join.
        spans.sort(key=lambda span: span[0])
        elements = [
            Element(start_pos=start,
                    end_pos=end,
                    content=content,
                    tag=tag,
                    class_=class_,
                    qu_niu=self.qi_niu)
            for start, end, tag, content, class_ in spans
        ]
        self._markdown = ''.join(str(e) for e in elements)
        # Strip the leftover tags listed in DELETE_ELEMENTS.
        for pattern in DELETE_ELEMENTS:
            self._markdown = re.sub(pattern, '', self._markdown)
        return self._markdown

    @property
    def markdown(self):
        """Run the conversion on ``self.html`` and return the Markdown text."""
        self.convert(self.html)
        return self._markdown

    def export(self, folder=False):
        """Write the converted Markdown to a file as UTF-8.

        The target folder is ``folder`` when given, otherwise ``self.folder``,
        otherwise the current working directory (with a warning). The
        conversion is run first if it has not been run yet.
        """
        if not hasattr(self, '_markdown'):
            self.convert()
        if len(self.file) < 1:
            warnings.warn("file not specified, renamed to tmp.md")
            file_name = "tmp.md"
        else:
            file_name = self.file.replace('.html', '.md')  # rename to md
        if folder:  # an explicit argument wins over the instance default
            target_dir = folder
        elif len(self.folder) >= 2:
            target_dir = self.folder
        else:
            warnings.warn("folder not specified, will save to pwd")
            target_dir = ''
        path = os.path.join(target_dir, file_name) if target_dir else file_name
        with open(path, 'w', encoding='utf-8') as f:
            f.write(self._markdown)
if __name__ == '__main__':
    # Load the Qiniu credentials from config.txt; when the file, the section
    # or an option is missing, fall back to a plain conversion without uploads.
    try:
        config = configparser.ConfigParser()
        config.read('config.txt')
        access_key = config.get('QINIU', 'access_key')
        secret_key = config.get('QINIU', 'secret_key')
        domain = config.get('QINIU', 'domain')
        bucket_name = config.get('QINIU', 'bucket_name')
    except (configparser.Error, OSError, UnicodeError):
        access_key, secret_key, domain, bucket_name = '', '', '', ''

    html_url = 'http://fualan.com/blog/article/12/'
    article = requests.get(html_url, headers=rand_header(), timeout=30)
    article.raise_for_status()
    soup = BeautifulSoup(article.text, "html.parser")
    article_content = soup.find("section", class_="article typo")
    if article_content is None:
        # Without this check the literal text "None" would be converted.
        raise SystemExit('article section not found in {}'.format(html_url))

    qi_niu = None
    if access_key and secret_key and domain and bucket_name:
        # Upload images to Qiniu using the credentials read from config.txt.
        # QiNiu builds URLs as domain + key, so make sure the domain ends with '/'.
        if not domain.endswith('/'):
            domain += '/'
        qi_niu = QiNiu(access_key=access_key,
                       secret_key=secret_key,
                       domain=domain,
                       bucket_name=bucket_name)
    # With qi_niu left as None the images keep their original URLs.
    markdown_content = MarkdownMaker(str(article_content), qi_niu=qi_niu).markdown
    print(markdown_content)