# import builtwith  # detects the technologies a website is built with
# import whois
import re
import urllib.error
import urllib.parse  # urllib.parse builds absolute URLs from relative links
import urllib.request
import urllib.robotparser


# A flexible download function: it catches exceptions, retries failed
# downloads, and sets a user agent.
def download(url, user_agent='wswp', num_retries=2):
    print("Downloading:", url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # decode to str so the regexes in get_links() can run on the result
        html = urllib.request.urlopen(request).read().decode('utf-8')
    except urllib.error.URLError as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors, passing the user agent through
                return download(url, user_agent, num_retries - 1)
    return html
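
# Example usage (illustrative: httpstat.us echoes whatever status code is in
# the path; only the /500 call in __main__ below comes from this file):
#
#     download('http://httpstat.us/200')  # succeeds on the first attempt
#     download('http://httpstat.us/500')  # retried twice, then returns None
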

# Link crawler driven by regular expressions.
# It records every URL it has discovered, so nothing is downloaded twice.
def link_crawler(seed_url, link_regex):
    """
    Crawl from the given seed URL, following links matched by link_regex
    :param seed_url: URL to start crawling from
    :param link_regex: regex that links must match to be followed
    :return:
    """
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            continue
        # filter for links matching our regular expression
        for link in get_links(html):
            # check if the link matches the expected regex
            if re.match(link_regex, link):
                # form an absolute link
                link = urllib.parse.urljoin(seed_url, link)
                # check if we have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
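
# Example usage (illustrative: the link pattern is an assumption, not part of
# this file; the site is the one referenced in the commented-out builtwith
# call below):
#
#     link_crawler('http://example.webscraping.com', '/(index|view)')
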

def get_links(html):
    """
    Return a list of links from html
    :param html: HTML document as a string
    :return: list of href values extracted from <a> tags
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)
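
# A quick sanity check of the extraction regex (illustrative markup):
#
#     >>> get_links('<a href="/index/1">Index</a>')
#     ['/index/1']
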

# Python's built-in robotparser module parses robots.txt files, so we can
# avoid downloading URLs that a site disallows for crawlers.
if __name__ == "__main__":
    download('http://httpstat.us/500')
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url('http://baidu.com/robots.txt')
    rp.read()  # fetches and parses robots.txt (read() itself returns None)
    print(rp.can_fetch('wswp', 'http://baidu.com/'))
    # print(builtwith.parse('http://example.webscraping.com'))
    # print(whois.query("sohu.com"))
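
    # A minimal sketch (the seed URL and link pattern here are assumptions)
    # of gating the crawl on robots.txt before letting link_crawler run:
    #
    #     if rp.can_fetch('wswp', seed_url):
    #         link_crawler(seed_url, '/(index|view)')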