"""
网站下载器
"""
__author__ = 'Stardust1001'
from urllib import request, error
from urllib.request import Request, urlopen, urljoin, urlretrieve, urlparse
import os, shutil, re, time, threading, http
from http import cookiejar
from queue import Queue, Empty
import logging
import socket
socket.setdefaulttimeout(20)
import ssl
try:
_create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
pass
else:
ssl._create_default_https_context = _create_unverified_https_context


def init_opener():
    cookie = cookiejar.CookieJar()
    cookie_support = request.HTTPCookieProcessor(cookie)
    return request.build_opener(cookie_support)

opener = init_opener()


def init_logger():
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    file_handler = logging.FileHandler('log.log', mode='w', encoding='UTF-8')
    file_handler.setLevel(logging.NOTSET)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    return logger

logger = init_logger()


class Manager:
    """
    Crawl manager running in the main thread.
    It collects new links from the spider threads, filters them, and feeds
    the shared link Queue that the spider threads consume.
    """
    def __init__(self, home_url):
        # Root folder for the downloaded site. The site may span several
        # subdomains, so a parent "<name>-site" folder holds one folder per
        # subdomain.
        home_dir = '{0}-site/{1}'.format(home_url.split('.')[1], home_url.split('/')[2])
        if os.path.exists(home_dir):
            shutil.rmtree(os.path.dirname(home_dir))
        os.makedirs(home_dir)
        parsed_url = urlparse(home_url)
        scheme = parsed_url.scheme
        # Top-level domain of the crawled site, used to keep the crawl on-site.
        top_domain = '.'.join(parsed_url.netloc.split('.')[1:])
        # Maximum number of attempts per request.
        max_tries = 3
        # Queue of links waiting to be crawled.
        self.link_queue = Queue()
        self.link_queue.put(home_url)
        # Set of every link seen so far, used to deduplicate before queueing.
        self.links = set([home_url])
        # Spider threads, 8 by default.
        self.spiders = []
        for i in range(8):
            self.spiders.append(Spider(home_dir, home_url, self.link_queue, scheme, top_domain, max_tries))
    def start(self):
        """
        Start the spider threads and run the manager loop.
        """
        for spider in self.spiders:
            spider.start()
        # Time of the last new link; stop once 60 seconds pass without one.
        last_new_time = time.time()
        # Collect new links from the spider threads and queue them.
        while True:
            for spider in self.spiders:
                new_links = spider.get_links()
                if new_links:
                    last_new_time = time.time()
                    for link in new_links:
                        # Strip the URL fragment before deduplicating.
                        sharp_index = link.find('#')
                        if sharp_index > 0:
                            link = link[0:sharp_index]
                        if link not in self.links and len(link) < 250:
                            self.links.add(link)
                            self.link_queue.put(link, True)
            if time.time() - last_new_time >= 60:
                break
            # Avoid busy-waiting between polls of the spider threads.
            time.sleep(0.5)
        # Ring the terminal bell to signal that the download is complete.
        for i in range(10):
            print('\a')
            time.sleep(0.5)


class Spider(threading.Thread):
    """
    Spider thread.
    Takes links from the shared queue, extracts new links from html and css
    files, and downloads every other file type directly.
    """
    def __init__(self, home_dir, home_url, link_queue, scheme, top_domain, max_tries):
        threading.Thread.__init__(self)
        self.home_dir = home_dir
        self.home_url = home_url
        self.link_queue = link_queue
        self.scheme = scheme
        self.top_domain = top_domain
        self.max_tries = max_tries
        # File suffixes that are downloaded directly, without link extraction.
        self.other_suffixes = set([
            'js', 'jpg', 'png', 'gif', 'svg', 'json', 'xml', 'ico', 'jpeg', 'ttf', 'mp3', 'mp4', 'wav',
            'doc', 'xls', 'pdf', 'docx', 'xlsx', 'eot', 'woff', 'csv', 'swf', 'tar', 'gz', 'zip', 'rar', 'txt',
            'exe', 'ppt', 'pptx', 'm3u8', 'avi', 'wsf'
        ])
        # Large media suffixes that get a longer download timeout.
        self.media_suffixes = set(['mp3', 'mp4', 'pdf', 'gz', 'tar', 'zip', 'rar', 'wav', 'm3u8', 'avi'])
        # Common top-level domain names.
        self.domain_names = set(['com', 'cn', 'net', 'org', 'gov', 'io'])
        # Link pattern for html content.
        self.html_pat = re.compile(r'(href|src)=(\"|\')([^\"\']*)')
        # Link pattern for css content.
        self.css_pat = re.compile(r'url\((\"|\')([^\"\']*)')
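        # Illustrative matches (sample inputs, not from the source):
        #   html_pat on '<a href="a/b.html">' captures ('href', '"', 'a/b.html'); group 3 is the URL.
        #   css_pat on "url('img/bg.png')" captures ("'", 'img/bg.png'); group 2 is the URL.
        # Note that both patterns only match quoted attribute/url values.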
        self.links = set()

    def run(self):
        logger.info('{0} start.'.format(threading.current_thread().name))
        # Pull links from the shared queue; give up after 60 idle seconds.
        while True:
            try:
                link = self.link_queue.get(timeout=60)
                self.spide(link)
            except Empty:
                break
        logger.info('{0} end.'.format(threading.current_thread().name))

    def spide(self, link):
        # Crawl one link, dispatching on its file suffix.
        try:
            suffix = link.split('.')[-1].lower()
            if suffix == 'css':
                self.handle_css(link)
            elif suffix in self.other_suffixes:
                self.download(link)
            else:
                self.handle_html(link)
        except Exception:
            logger.exception('[Unknown Error]\t{0}'.format(link))
    def handle_html(self, link):
        # Handle an html link.
        html = self.get_res(link)
        if html is None:
            return
        html_raw_links = set([ele[2] for ele in self.html_pat.findall(html)])
        html_raw_links = html_raw_links.union([ele[1] for ele in self.css_pat.findall(html)])
        if html_raw_links:
            # Keep only the links worth crawling.
            valid_links = list(filter(self.is_valid_link, html_raw_links))
            # Repair malformed scheme prefixes.
            handled_links = list(map(self.handle_valid_link, valid_links))
            # Publish the new links for the manager thread to collect.
            self.links = self.links.union([urljoin(link, t_link) for t_link in handled_links])
            # Rewrite the links in the html to local relative paths.
            html = self.replace_links(html, valid_links, self.normalize_link(link))
        # Save the html file.
        with open(self.make_filepath(self.normalize_link(link)), 'w', encoding='utf-8') as f_w:
            f_w.write(html)
        logger.info('Handled\t{0}'.format(link))
    def handle_css(self, link):
        """
        Handle a css link.
        """
        text = self.get_res(link)
        if text is None:
            return
        css_raw_links = set([ele[1] for ele in self.css_pat.findall(text)])
        if css_raw_links:
            css_raw_links = list(filter(self.is_valid_link, css_raw_links))
            self.links = self.links.union([urljoin(link, t_link) for t_link in css_raw_links])
            text = self.replace_links(text, css_raw_links, self.normalize_link(link))
        with open(self.make_filepath(self.normalize_link(link)), 'w', encoding='utf-8') as f_w:
            f_w.write(text)
        logger.info('Handled\t{0}'.format(link))
    def is_valid_link(self, link):
        """
        Check whether a link is worth crawling.
        Embedded data:image URIs are not treated as new links, and absolute
        links outside the site's top-level domain are skipped.
        """
        if link.find('javascript:') >= 0 or link.find('@') >= 0 or link.find('data:image') >= 0:
            return False
        if link.find('http') >= 0:
            netloc = urlparse(link).netloc
            if netloc:
                if netloc.find(':80') > 0:
                    netloc = netloc.replace(':80', '')
                return netloc[netloc.find('.') + 1:] == self.top_domain
        return True
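    # Illustrative checks, assuming top_domain is 'example.com' (sample
    # inputs, not from the source):
    #   is_valid_link('http://www.example.com/a') -> True
    #   is_valid_link('http://other.com/a')       -> False (off-site)
    #   is_valid_link('data:image/png;base64,..') -> False (embedded data)
    #   is_valid_link('css/main.css')             -> True  (relative links pass)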
    def handle_valid_link(self, link):
        """
        Repair links written with a malformed scheme:
        http:www.baidu.com and http:/www.baidu.com both become http://www.baidu.com
        """
        if not link:
            return link
        # Protocol-relative link: prepend the site's scheme.
        if link[0:2] == '//':
            return self.scheme + ':' + link
        # Root-relative link: resolve it against the home url.
        if link[0] == '/':
            return urljoin(self.home_url, link)
        if link.find('http') < 0 or link.find('http://') >= 0 or link.find('https://') >= 0:
            return link
        if link.find('http:/') >= 0 or link.find('https:/') >= 0:
            return link.replace(':/', '://')
        if link.find('http:') >= 0 or link.find('https:') >= 0:
            first_colon = link.find(':')
            return link[0:first_colon] + '://' + link[first_colon + 1:]
        return link
    def get_res(self, link):
        """
        Fetch and decode the response for an html or css link.
        """
        res = None
        num_tries = 0
        # Retry transient network failures; give up on permanent errors.
        while num_tries < self.max_tries:
            try:
                res = opener.open(Request(link)).read()
                break
            except error.HTTPError:
                logger.error('[error.HTTPError]\t{0}'.format(link))
                return None
            except error.URLError:
                logger.error('[error.URLError]\t{0}'.format(link))
                return None
            except UnicodeEncodeError:
                logger.error('[UnicodeEncodeError]\t{0}'.format(link))
                return None
            # RemoteDisconnected must precede BadStatusLine, its base class,
            # or disconnects would never be retried.
            except http.client.RemoteDisconnected:
                logger.error('[RemoteDisconnected]\t{0}'.format(link))
                num_tries += 1
            except http.client.BadStatusLine:
                logger.error('[http.client.BadStatusLine]\t{0}'.format(link))
                return None
            except http.client.IncompleteRead:
                logger.error('[http.client.IncompleteRead]\t{0}'.format(link))
                return None
            except TimeoutError:
                logger.error('[TimeoutError]\t{0}'.format(link))
                num_tries += 1
            except socket.timeout:
                logger.error('[socket.timeout]\t{0}'.format(link))
                num_tries += 1
            except ConnectionResetError:
                logger.error('[ConnectionResetError]\t{0}'.format(link))
                num_tries += 1
        if num_tries >= self.max_tries:
            logger.warning('[failed get]\t{0}'.format(link))
            return None
        # Decode the response, trying common encodings in turn.
        for encoding in ('utf-8', 'gb2312', 'gbk'):
            try:
                return res.decode(encoding)
            except UnicodeDecodeError:
                pass
        logger.error('[UnicodeDecodeError]\t{0}'.format(link))
        return None
    def download(self, link):
        """
        Download any other file type directly.
        """
        socket.setdefaulttimeout(20)
        # Give large media files a much longer timeout.
        if link.split('.')[-1].lower() in self.media_suffixes:
            socket.setdefaulttimeout(600)
        num_tries = 0
        success = False
        # Retry transient network failures; give up on permanent errors.
        while num_tries < self.max_tries:
            try:
                urlretrieve(link, self.make_filepath(link))
                success = True
                break
            except error.HTTPError:
                logger.error('[error.HTTPError]\t{0}'.format(link))
                break
            except error.URLError:
                logger.error('[error.URLError]\t{0}'.format(link))
                break
            except UnicodeEncodeError:
                logger.error('[UnicodeEncodeError]\t{0}'.format(link))
                break
            # RemoteDisconnected must precede BadStatusLine, its base class,
            # or disconnects would never be retried.
            except http.client.RemoteDisconnected:
                logger.error('[RemoteDisconnected]\t{0}'.format(link))
                num_tries += 1
            except http.client.BadStatusLine:
                logger.error('[http.client.BadStatusLine]\t{0}'.format(link))
                break
            except http.client.IncompleteRead:
                logger.error('[http.client.IncompleteRead]\t{0}'.format(link))
                break
            except TimeoutError:
                logger.error('[TimeoutError]\t{0}'.format(link))
                num_tries += 1
            except socket.timeout:
                logger.error('[socket.timeout]\t{0}'.format(link))
                num_tries += 1
            except ConnectionResetError:
                logger.error('[ConnectionResetError]\t{0}'.format(link))
                num_tries += 1
        if not success:
            logger.warning('[failed download]\t{0}'.format(link))
            return
        logger.info('Downloaded\t{0}'.format(link))
    def make_filepath(self, link):
        """
        Build the local file path for a link, creating directories as needed.
        """
        abs_filepath = self.get_abs_filepath(link)
        dirname = os.path.dirname(abs_filepath)
        if not os.path.exists(dirname):
            try:
                os.makedirs(dirname)
            except FileExistsError:
                # Another spider thread created it first.
                pass
            except NotADirectoryError:
                logger.error('[NotADirectoryError]\t{0}\t{1}'.format(link, abs_filepath))
        return abs_filepath
    def get_abs_filepath(self, link):
        """
        Map a link to an absolute file path inside the local site folder.
        """
        old_link = link
        if link[-1] == '/':
            link += 'index.html'
        elif link.split('.')[-1] in self.domain_names:
            link += '/index.html'
        rel_url = os.path.relpath(link, self.home_url)
        # Pages with query strings and extensionless pages are saved as html.
        if rel_url.find('?') >= 0:
            rel_url += '.html'
        if rel_url.split('/')[-1].find('.') < 0 or rel_url == '.':
            rel_url += 'index.html'
        abs_filepath = os.path.join(self.home_dir, rel_url)
        # Collapse any '..' segments left by relpath so the file stays
        # inside the site folder.
        if abs_filepath.find('..') > 0:
            parts = abs_filepath.split('..')
            abs_filepath = '/'.join(parts[0].split('/')[0:-2]) + parts[1]
        if os.path.isdir(abs_filepath):
            logger.warning('[isdir]\t{0}\t{1}'.format(old_link, abs_filepath))
            abs_filepath = os.path.join(abs_filepath, 'index.html')
        return abs_filepath
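    # Illustrative mappings, assuming home_url http://www.example.com/ and
    # home_dir example-site/www.example.com (sample values, not from the source):
    #   http://www.example.com/css/main.css -> example-site/www.example.com/css/main.css
    #   http://www.example.com/news/        -> example-site/www.example.com/news/index.html
    #   http://www.example.com/item?id=3    -> example-site/www.example.com/item?id=3.html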
    def replace_links(self, content, links, cur_url):
        """
        Rewrite links inside html or css content to local relative paths.
        """
        # Deduplicate, then replace longer links first so a link that is a
        # prefix of another link does not clobber it.
        for link in sorted(set(links), key=len, reverse=True):
            link_abspath = self.get_abs_filepath(urljoin(cur_url, self.normalize_link(link)))
            cur_url_abspath = self.get_abs_filepath(cur_url)
            # os.path.relpath treats cur_url_abspath as a directory, so its
            # result starts with one extra '.' that must be dropped.
            rel_link = os.path.relpath(link_abspath, cur_url_abspath)[1:].replace('?', '%3F')
            replacement = '"{0}"'.format(rel_link)
            content = content.replace(
                '"{0}"'.format(link), replacement
            ).replace('\'{0}\''.format(link), replacement)
        return content
    def normalize_link(self, link):
        """
        Strip :80 ports and force the site's scheme onto absolute links.
        """
        if link.find('http') < 0:
            return link
        if link.find(':80') > 0:
            link = link.replace(':80', '')
        first_colon = link.find(':')
        return self.scheme + link[first_colon:]
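    # e.g. normalize_link('https://www.example.com:80/a') -> 'http://www.example.com/a'
    # when the site's scheme is http (sample input, not from the source).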
    def get_links(self):
        """
        Called by the manager thread to collect this spider's new links.
        """
        # Swap in a fresh set instead of clearing in place, so a link the
        # spider adds concurrently is re-exported later rather than lost.
        export_links = self.links
        self.links = set()
        return export_links


if __name__ == '__main__':
    manager = Manager('http://www.whsw.net/')
    manager.start()
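# Usage sketch: run the script directly (python WebSite-Downloader.py).
# To mirror another site, pass its home page URL instead, e.g.
# Manager('http://www.example.com/'); files land under a local
# "example-site/www.example.com" folder, and progress is logged to the
# console and to log.log.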