# urllib库

> python内置的HTTP请求库

|模块名|作用|
|:-|:-|
|urllib.request|请求模块|
|urllib.parse|url解析模块|
|urllib.error|异常处理模块|
|urllib.robotparser|robots.txt解析|

python2与python3的变化

```python2
import urllib2
response = urllib2.urlopen("http://www.baidu.com")
```

```python3
import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
```

## request

### urlopen
`urllib.request.urlopen(url,data=None,[timeout,]*,cafile=None,capath=None,cadefault=False,context=None)`


In [None]:
import urllib.request

# Send a GET request (no data argument) and print the body decoded as UTF-8.
response = urllib.request.urlopen("http://www.baidu.com")
raw_body = response.read()
print(raw_body.decode("utf-8"))

In [None]:
import urllib.parse
import urllib.request

# The data argument must be bytes: url-encode the form, then encode as UTF-8.
form_encoded = urllib.parse.urlencode({'word':'hello'})
data = form_encoded.encode('utf-8')
# POST request: the encoded form is passed through the data argument.
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
raw_body = response.read()
print(raw_body.decode('utf-8'))

In [None]:
import urllib.request

# Pass the timeout argument (in seconds) and print the raw bytes of the body.
response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)
payload = response.read()
print(payload)


In [None]:
import socket
import urllib.request
import urllib.error

# Use a very small timeout to provoke a timeout and report it.
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
except socket.timeout:
    # A timeout may be raised directly instead of being wrapped in URLError
    # (e.g. while waiting for the response), so handle that form as well.
    print('TIME OUT')
except urllib.error.URLError as e:
    # Wrapped form: the underlying cause is carried in e.reason.
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

## 响应

### 响应类型

In [None]:
import urllib.request

# Show the class of the object returned by urlopen.
response = urllib.request.urlopen('http://www.python.org')
response_type = type(response)
print(response_type)

### 响应头、状态码

In [None]:
import urllib.request

# Inspect the status code, the full header list, and a single header.
response = urllib.request.urlopen('http://www.python.org')
status_code = response.status
all_headers = response.getheaders()
server_header = response.getheader('Server')
print(status_code)
print(all_headers)
print(server_header)

### read获取响应体

In [None]:
import urllib.request

# Read the whole response body and print it decoded as UTF-8.
# (Host fixed: the original 'www.pyton.org' was a typo of 'www.python.org'.)
response = urllib.request.urlopen('http://www.python.org')
print(response.read().decode('utf-8'))

## Request

In [None]:
import urllib.request

# Build a Request object first, then hand it to urlopen to send it.
request = urllib.request.Request('https://python.org')
response = urllib.request.urlopen(request)
status_code = response.status
print(status_code)

In [None]:
from urllib import request,parse

# POST a form with custom headers supplied through the Request constructor.
url = "http://httpbin.org/post"
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org',
}
# Named form_fields rather than dict so the builtin dict is not shadowed.
form_fields = {
    'name': 'Germey',
}
# Request bodies must be bytes: url-encode the form, then encode as UTF-8.
data = bytes(parse.urlencode(form_fields), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))


In [None]:
from urllib import request,parse

# POST a form, attaching the header afterwards with Request.add_header.
url = "http://httpbin.org/post"
# Named form_fields rather than dict so the builtin dict is not shadowed.
form_fields = {
    'name': 'Germey',
}
# Request bodies must be bytes: url-encode the form, then encode as UTF-8.
data = bytes(parse.urlencode(form_fields), encoding='utf-8')
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible;MSIE 5.5;Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

## Handler

### 代理

In [None]:
import urllib.request

# Route requests through a local proxy by building an opener around ProxyHandler.
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743',
})
opener = urllib.request.build_opener(proxy_handler)
# The same name is now used for assignment and read; the original assigned
# 'reponse' and then read 'resonse', which raised NameError.
response = opener.open('http://httpbin.org/get')
print(response.read().decode('utf-8'))

## cookie

In [None]:
import urllib.request
import http.cookiejar

# Collect the cookies set by the server into an in-memory CookieJar.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(dir(cookie))
# Print every stored cookie as name=value.
for item in cookie:
    print("=".join((item.name, item.value)))

In [None]:
import http.cookiejar
import urllib.request

# Save the cookies to a local file in Mozilla format.
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Keep the response in its own name; the original rebound 'opener' to the
# response object, leaving the opener unusable afterwards.
response = opener.open('http://www.baidu.com')
# Write to the filename given to the constructor, keeping discardable and
# expired cookies as well.
cookie.save(ignore_discard=True, ignore_expires=True)

In [None]:
import http.cookiejar
import urllib.request

# Save the cookies to a local file in LWP format.
filename = 'LWPCookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# NOTE(review): the original URL 'http://www.naodi.com' looks like a typo of
# the baidu.com address used by the other cookie cells - confirm.
response = opener.open('http://www.baidu.com')
print(dir(cookie))
# Write to the filename given to the constructor, keeping discardable and
# expired cookies as well.
cookie.save(ignore_discard=True, ignore_expires=True)

In [None]:
import http.cookiejar
import urllib.request

# Load cookies from a local Mozilla-format file and send them with a request.
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar()
cookie.load(filename, ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
raw_body = response.read()
print(raw_body.decode('utf-8'))

## 异常处理

In [None]:
from urllib import request,error
# The HTTPError handler comes first, so HTTP errors print code, reason and
# headers; any other URLError prints only its reason.
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as http_err:
    print(http_err.code, http_err.reason, http_err.headers, sep='\n')
except error.URLError as url_err:
    print(url_err.reason)
else:
    # Runs only when no exception was raised.
    print('Request Successfully')

In [None]:
import socket
import urllib.request
import urllib.error

# Use a deliberately tiny timeout to provoke a timeout and report it.
try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
except socket.timeout as e:
    # A timeout may be raised directly instead of being wrapped in URLError
    # (e.g. while waiting for the response), so handle that form as well.
    print(e)
    print("TIME OUT")
except urllib.error.URLError as e:
    # Wrapped form: the underlying cause is carried in e.reason.
    print(e.reason)
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")

## URL解析

### urlparse

`urllib.parse.urlparse(urlstring, scheme="",allow_fragments=True)`

In [1]:
from urllib.parse import urlparse

# Split a URL into its six components:
#   scheme   - protocol
#   netloc   - domain
#   path     - route
#   params   - parameters
#   query    - query string
#   fragment - fragment
sample_url = 'http://www.baidu.com/index.html;user?id=5#comment'
result = urlparse(sample_url)
print(type(result), result)

<class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')


In [2]:
from urllib.parse import urlparse

# The URL carries no scheme, so the scheme argument supplies it.
schemeless_url = 'www.baidu.com/index.html;user?id=5#comment'
result = urlparse(schemeless_url, scheme='https')
print(result)

ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')


In [3]:
from urllib.parse import urlparse

# The URL already has a scheme, so the scheme argument is not used.
full_url = 'http://www.baidu.com/index.html;user?id=5#comment'
result = urlparse(full_url, scheme='https')
print(result)

ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')


In [7]:
from urllib.parse import urlparse

# With allow_fragments=False the '#comment' part stays attached to the query.
full_url = 'http://www.baidu.com/index.html;user?id=5#comment'
result = urlparse(full_url, allow_fragments=False)
print(result)

ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')


### urlunparse

In [8]:
from urllib.parse import urlunparse

# Assemble a URL from its six components, in urlparse order.
data = [
    'http',           # scheme
    'www.baidu.com',  # netloc
    'index.html',     # path
    'user',           # params
    'a=6',            # query
    'comment',        # fragment
]
assembled = urlunparse(data)
print(assembled)

http://www.baidu.com/index.html;user?a=6#comment


### urljoin

In [16]:
from urllib.parse import urljoin

# A relative second argument is resolved against the base URL.
print(urljoin('http://www.baidu.com', 'FAQ.html'))
# An absolute second argument replaces the base entirely.
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
# (Host fixed: the original base 'http://www,baidu.com/about.html' had a comma.)
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
# Only a query and fragment are given, so they are appended to the base.
print(urljoin('www.baidu.com', '?category=2#comment'))
print(urljoin('http://www.baidu.com/index.html?a', 'http://www.test.com'))

http://www.baidu.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html?question=2
www.baidu.com?category=2#comment
http://www.test.com


### urlencode

In [17]:
from urllib.parse import urlencode

# Turn a dict into a query string and append it to the base URL.
params = {'name': 'germey', 'grade': '6'}
base_url = 'http://www.baidu.com?'
query_string = urlencode(params)
url = f"{base_url}{query_string}"
print(url)

http://www.baidu.com?name=germey&grade=6


## urllib.robotparser

In [None]:
import urllib.robotparser
# Download and parse the site's robots.txt.
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.musi-cal.com/robots.txt')
rp.read()

# request_rate() returns None when robots.txt has no applicable Request-rate
# entry, so guard before reading its fields instead of raising AttributeError.
rrate = rp.request_rate("*")
if rrate is not None:
    print(rrate.requests)
    print(rrate.seconds)

# Print each result explicitly; as bare expressions they were discarded.
print(rp.crawl_delay('*'))
print(rp.can_fetch('*', 'http://www.musi-cal.com/cgi-bin/search?city=San+Francisco'))
print(rp.can_fetch('*', 'http://www.musi-cal.com/'))