In [40]:
# 2.2 requests 的使用
# 相对 urllib, 在处理登录验证和 Cookies, POST, GET等方面都进行了优化
import requests
import re
import logging

# Base URLs used by the examples in this notebook.
BAIDU = 'https://www.baidu.com/'
TESTURL = 'https://www.httpbin.org/'      # echoes requests back (GET/POST/cookie demos)
CRAWLURL = 'https://ssr1.scrape.center/'  # page scraped in the regex demo
SSLURL = 'https://ssr2.scrape.center/'    # requested with verify=False in the SSL demo
AUTHURL = 'https://ssr3.scrape.center/'   # requested with auth=(...) in the auth demo

# Sample query parameters sent along with the basic GET request.
data = dict(name='python', age=32)

In [41]:
# Basic usage: send a GET request with query parameters and inspect the response
r = requests.get(f'{TESTURL}get', params=data)
print(type(r))
print(r.status_code)
print(type(r.text))
# print(r.text[:100])
print(r.text)
print(r.cookies)

# The response body is a JSON string; json() parses it into a dict
print(type(r.json()))
print(r.json())

# Compare against the named constants in requests.codes (easier to remember
# than bare numbers)
if r.status_code != requests.codes.OK:
    exit()
else:
    print('Request successfully!!!')

<class 'requests.models.Response'>
200
<class 'str'>
{
  "args": {
    "age": "32", 
    "name": "python"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "www.httpbin.org", 
    "User-Agent": "python-requests/2.27.1", 
    "X-Amzn-Trace-Id": "Root=1-627e2346-740f28c709891c62242dbe5c"
  }, 
  "origin": "120.235.227.24", 
  "url": "https://www.httpbin.org/get?name=python&age=32"
}

<RequestsCookieJar[]>
<class 'dict'>
{'args': {'age': '32', 'name': 'python'}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'www.httpbin.org', 'User-Agent': 'python-requests/2.27.1', 'X-Amzn-Trace-Id': 'Root=1-627e2346-740f28c709891c62242dbe5c'}, 'origin': '120.235.227.24', 'url': 'https://www.httpbin.org/get?name=python&age=32'}
Request successfully!!!


In [42]:
# Combine requests with a regular expression to pull content out of the page
r = requests.get(CRAWLURL)
# re.S makes '.' match newlines too, so the pattern can span several lines
pattern = re.compile('<h2.*?>(.*?)</h2>', flags=re.S)
title = pattern.findall(r.text)
print(title)

['霸王别姬 - Farewell My Concubine', '这个杀手不太冷 - Léon', '肖申克的救赎 - The Shawshank Redemption', '泰坦尼克号 - Titanic', '罗马假日 - Roman Holiday', '唐伯虎点秋香 - Flirting Scholar', '乱世佳人 - Gone with the Wind', '喜剧之王 - The King of Comedy', '楚门的世界 - The Truman Show', '狮子王 - The Lion King']


In [43]:
# Binary resources: images, video, etc.
r = requests.get('https://scrape.center/favicon.ico')
# Decoding the image bytes as text produces garbage
print(r.text)
# .content is a bytes object (its repr starts with b); the image is raw binary
# data, so saving it unchanged yields the image file
print(r.content)
# Write the bytes to disk in binary mode
with open('./favicon.ico', mode='wb') as f:
    f.write(r.content)

           Ё     (       @                         W?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляX@ляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляZBляX@ляW?ляW?ляW?ляW?ляW?ляW?ляW?ляX@ляR:кяJ1кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяK2кяJ1кяR:кяX@ляW?ляW?ляW?ляW?ляW?ляX@ляR:кяr]ояџ‘фяљЉуя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя›‹уя™Љуяџ‘фяr]ояR:кяX@ляW?ляW?ляW?ляW?ляZBляK2кяљЊуяяяяяььяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяяььяяяяяяљЊуяK2кяZBляW?ляW?ляW?ляW?ляZBляK2кя™ЉуяяяяяъщюяььяяььяяььяяььяяььяяььяяььяяььяяььяяььяяььяяььяяььяяььяяььяяььяяььяяььяяъщюяяяяяљЉуяK2кяZBляW?ляW?ляW?ляW?ляZBляK2кяљЊуяяяяяььяяяяяяяяяяююяяюю

In [44]:
# File upload: POST a local file through the `files` argument.
# The file is opened inside a `with` block so its handle is closed as soon as
# the request has been sent, instead of staying open for the kernel's lifetime.
with open('./favicon.ico', 'rb') as f:
    files = {'file': f}
    r = requests.post(TESTURL + 'post', files=files)
print(r.text)

{
  "args": {}, 
  "data": "", 
  "files": {
    "file": "data:application/octet-stream;base64,AAABAAEAICAAAAEAIACoEAAAFgAAACgAAAAgAAAAQAAAAAEAIAAAAAAAABAAABILAAASCwAAAAAAAAAAAABXP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1hA6/9aQuv/WkLr/1pC6/9aQuv/WkLr/1pC6/9aQuv/WkLr/1pC6/9aQuv/WkLr/1pC6/9aQuv/WkLr/1pC6/9aQuv/WkLr/1pC6/9aQuv/WkLr/1pC6/9aQuv/WEDr/1c/6/9XP+v/Vz/r/1c/6/9XP+v/Vz/r/1c/6/9YQOv/Ujrq/0ox6v9LMur/SzLq/0sy6v9LMur/SzLq/0sy6v9LMur/SzLq/0sy6v9LMur/SzLq/0sy6v9LMur/SzLq/0sy6v9LMur/SzLq/0sy6v9LMur/SzLq/0ox6v9SOur/WEDr/1c/6/9XP+v/Vz/r/1c/6/9XP+v/WEDr/1I66v9yXe7/n5H0/5qK8/+bi/P/m4vz/5uL8/+bi/P/m4vz/5uL8/+bi/P/m4vz/5uL8/+bi/P/m4vz/5uL8/+bi/P/m4vz/5uL8/+bi/P/m4vz/5uL8/+ZivP/n

In [45]:
# Keeping a logged-in state by sending cookies
# Option 1: put the raw cookie string directly into the request headers
# headers = {
#     'Cookie': 'soomalreferer=http%3A%2F%2Fwww.soomal.com%2Fuser%2F101.login.htm; soomalday=30; soomalid=; soomalmima=8%5B%2Fsh*wN%25PP%3FIkR%5B; soomalname=Fitz',
#     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15',
# }
# Option 2: build a RequestsCookieJar object
# NOTE(review): this cookie string looks like a real login cookie; consider
# loading it from an environment variable instead of committing it.
jar = requests.cookies.RequestsCookieJar()
cookies = 'soomalreferer=http%3A%2F%2Fwww.soomal.com%2Fuser%2F101.login.htm; soomalday=30; soomalid=; soomalmima=8%5B%2Fsh*wN%25PP%3FIkR%5B; soomalname=Fitz'
for c in cookies.split(';'):
    # Pairs are separated by '; ', so strip the blank that would otherwise
    # become part of the cookie name (e.g. ' soomalday').
    c = c.strip()
    # Skip empty or malformed segments (such as one left by a trailing ';').
    if '=' not in c:
        continue
    # Split on the first '=' only, so a value that itself contains '=' is kept
    # intact instead of breaking the two-name unpacking.
    key, value = c.split('=', 1)
    jar.set(key, value)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15',
}
r = requests.get('http://www.soomal.com/bbs/index101000_0001_01.htm', headers=headers, cookies=jar)
print(r.text[:500])

<html>

<head>
<meta http-equiv="Content-Language" content="zh-cn">
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="search" href="../xml/search.xml" title="Soomal" type="application/opensearchdescription+xml">


<link rel="icon" type="image/png" href="../../v5/assets/i/favicon.png">

<!-- Add to homescreen for Chrome on Android -->


In [46]:
# Session persistence: every module-level requests call is an independent visit.
# Unless the same cookie is passed explicitly, each call carries different
# cookies, i.e. the calls do not belong to one visit.
# Without a session or explicit cookies, the second request cannot see the
# cookie set by the first, so the server treats them as two separate visits.
for path in ('cookies/set/number/987654321', 'cookies'):
    r = requests.get(TESTURL + path)
    print(r.text)

# With a session, requests made through the same Session object automatically
# share the same cookies.
s = requests.Session()
for path in ('cookies/set/number/987654321', 'cookies'):
    r = s.get(TESTURL + path)
    print(r.text)

{
  "cookies": {
    "number": "987654321"
  }
}

{
  "cookies": {}
}

{
  "cookies": {
    "number": "987654321"
  }
}

{
  "cookies": {
    "number": "987654321"
  }
}



In [47]:
# SSL certificate verification
# verify=False turns verification off: the request succeeds, but an HTTPS
# security warning is emitted.

# Option 1: silence the warning
# requests.packages.urllib3.disable_warnings()

# Option 2: route warnings into the logging system
logging.captureWarnings(True)
r = requests.get(SSLURL, verify=False)

# Option 3: supply a certificate and key
# r = requests.get(AUTHURL, cert=('/server.crt', '/server.key'))
print(r.status_code)

200


In [48]:
# Timeouts, for servers that respond slowly or not at all.
# The default timeout is None, which waits indefinitely.
r = requests.get(f'{TESTURL}get', timeout=1)
print(r.status_code)

# Separate limits for connecting and for reading, given as a tuple:
#   first element  -> connect limit (requests.exceptions.ConnectTimeout)
#   second element -> read limit    (requests.exceptions.ReadTimeout)
r = requests.get(f'{TESTURL}get', timeout=(1, 1))
print(r.status_code)

200
200


In [49]:
# Authentication
# A (username, password) tuple passed as `auth` can be used in place of the
# explicit HTTPBasicAuth form shown in the commented line below:
# r = requests.get(AUTHURL, auth=HTTPBasicAuth('admin', 'admin'))
r = requests.get(
    AUTHURL,
    auth=('admin', 'admin'),
)
print(r.status_code)

200
