# 导入相应的库

In [12]:
import requests
from bs4 import BeautifulSoup
import json

# requests库无参数get示例

In [14]:
text = requests.get("http://httpbin.org/get").text
json.loads(text)

{'args': {},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Connection': 'close',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.18.4'},
 'origin': '14.28.12.23',
 'url': 'http://httpbin.org/get'}

# requests库有参数get示例

In [16]:
payload = {'key1': 'value1', 'key2': 'value2'}
gettext = requests.get("http://httpbin.org/get",params=payload).text
gettext

'{\n  "args": {\n    "key1": "value1", \n    "key2": "value2"\n  }, \n  "headers": {\n    "Accept": "*/*", \n    "Accept-Encoding": "gzip, deflate", \n    "Connection": "close", \n    "Host": "httpbin.org", \n    "User-Agent": "python-requests/2.18.4"\n  }, \n  "origin": "14.28.12.23", \n  "url": "http://httpbin.org/get?key1=value1&key2=value2"\n}\n'

# requests库post请求示例

In [17]:
data = {'a':'123','b':'12345'}
post = requests.post("http://httpbin.org/post",data=data).text
json.loads(post)

{'args': {},
 'data': '',
 'files': {},
 'form': {'a': '123', 'b': '12345'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Connection': 'close',
  'Content-Length': '13',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.18.4'},
 'json': None,
 'origin': '14.28.12.23',
 'url': 'http://httpbin.org/post'}

# requests库headers使用示例

In [18]:
headers = {'user-agent':'python is your father','refer':'http://baidu.com'}
headerstext =  requests.get("http://httpbin.org/get",headers=headers).text
json.loads(headerstext)

{'args': {},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Connection': 'close',
  'Host': 'httpbin.org',
  'Refer': 'http://baidu.com',
  'User-Agent': 'python is your father'},
 'origin': '14.28.12.23',
 'url': 'http://httpbin.org/get'}

# requests库 timeout使用示例

In [20]:
requests.get('http://google.com')

KeyboardInterrupt: 

# 广州大学图书馆进馆人数爬取

In [22]:
url = 'http://lib.gzhu.edu.cn:8080/bookle/goLibTotal/custom'#图书馆查询进馆人数url
data = {'begin':'2018-05-01','end':'2018-05-06'}#构建post的数据
res = requests.post(url,data=data)#发出请求
soup = BeautifulSoup(res.text,'lxml')#为了下一步提取 构建soup对象

## 使用BeautifulSoup库进行网页数据解析提取

In [23]:
soup.find('div',id='total').text#寻找div标签 并且id参数为total的第一个标签

'总进馆人次:18796'

In [24]:
soup.find_all('td',width='300')#寻找所有td标签 并且width参数为300的标签

[<td width="300">法学院</td>,
 <td width="300">土木工程学院</td>,
 <td width="300">政治与公民教育学院</td>,
 <td width="300">数学与信息科学学院</td>,
 <td width="300">人文学院</td>,
 <td width="300">外国语学院</td>,
 <td width="300">经济与统计学院</td>,
 <td width="300">计算机科学与教育软件学院</td>,
 <td width="300">教育学院</td>,
 <td width="300">物理与电子工程学院</td>,
 <td width="300">生命科学学院</td>,
 <td width="300">机械与电气工程学院</td>,
 <td width="300">化学化工学院</td>,
 <td width="300">环境科学与工程学院</td>,
 <td width="300">公共管理学院</td>,
 <td width="300">地理科学学院</td>,
 <td width="300">工商管理学院</td>,
 <td width="300">旅游学院</td>,
 <td width="300">新闻与传播学院</td>,
 <td width="300">建筑与城市规划学院</td>,
 <td width="300">卫斯理安学院</td>,
 <td width="300">体育学院</td>,
 <td width="300">音乐舞蹈学院</td>,
 <td width="300">美术与设计学院</td>]

# 模拟登陆广州大学图书馆网站

In [25]:
libsession = requests.Session()#创建session对象以保存对话信息
lib_login_url = 'http://202.192.41.8/NTRdrLogin.aspx'

def get_view(response):
    '''获得模拟登陆时提交表单所需的三个参数'''
    soup = BeautifulSoup(response.text, "lxml")
    view = []
    view.append(soup.findAll(name="input")[0]["value"]) 
    view.append(soup.findAll(name="input")[1]["value"])
    view.append(soup.findAll(name="input")[2]["value"]) 
    return view
username = '17195000xxx'#用户名
password = 'xxxx'#密码
login_page = requests.get(lib_login_url)#获得登录页面
view = get_view(login_page)#获得登录页面上的几个必要的登录参数
post_data = {#构建登录post的数据
    '__VIEWSTATE':view[0],
    '__VIEWSTATEGENERATOR':view[1],
    '__EVENTVALIDATION':view[2],
    'txtName':username,
    'txtPassWord':password,
    'Logintype':'RbntRecno',
    'BtnLogin':'%E7%99%BB+%E5%BD%95'
}
res = libsession.post(lib_login_url,data=post_data,timeout = 5)#发起登录请求
res.text

'\r\n\r\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n\r\n<html xmlns="http://www.w3.org/1999/xhtml">\r\n<head><title>\r\n\t我的图书馆\r\n</title><link href="CSS/Loginstyle.css" rel="stylesheet" type="text/css" />\r\n    <script src="JS/jquery.js" type="text/javascript"></script>\r\n     <style type="text/css">\r\n        .go\r\n        {\r\n            width: 47px;\r\n            height: 106px;\r\n            position: fixed;\r\n            _position: absolute;\r\n            left:6px;\r\n            _top: expression(eval(document.documentElement.scrollTop+document.documentElement.clientHeight-this.offsetHeight-(parseInt(this.currentStyle.marginTop,10)||200)-(parseInt(this.currentStyle.marginBottom,10)||0)));\r\n            bottom: 40%;\r\n            background-image: url( "/jscss/demoimg/201208/tobg.png" );\r\n            background-repeat: no-repeat;\r\n        }\r\n        .go a\r\n        {\r\n            

# 模拟登陆广州大学教务网站

In [26]:
jw_loginurl = 'https://cas.gzhu.edu.cn/cas_server/login?service=http%3a%2f%2f202.192.18.183%2fLogin_gzdx.aspx'
jwsession = requests.Session()
jwsession.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'

def get_webflow(response):
    '''获得选课页面必须的lt 以及execution参数'''
    soup = BeautifulSoup(response.text,'lxml')
    lt = soup.find('input',{'name' : 'lt'})['value']
    execution = soup.find('input',{'name' : 'execution'})['value']
    soup.clear()
    return(lt,execution)
password = '112233'
get_lt = jwsession.get(url = jw_loginurl,timeout = 5)
lt, execution = get_webflow(get_lt)
postdata = {
    'username' : username,
    'password' : password,
    'lt' : lt,
    'execution' : execution,
    '_eventId' : 'submit',
    'submit' : '登录'
}
response = jwsession.post(url = jw_loginurl, data = postdata)
response.text

'\r\n<?xml version="1.0" encoding="gb2312" ?>\r\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n<HTML class="main_html">\r\n\t<HEAD>\r\n\t\t<title>正方教务管理系统</title><meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7" />\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=gb2312">\r\n\t\t<meta http-equiv="Content-Language" content="gb2312">\r\n\t\t<meta content="all" name="robots">\r\n\t\t<meta name="author" content="作者信息">\r\n\t\t<meta name="Copyright" content="版权信息">\r\n\t\t<meta name="description" content="站点介绍">\r\n\t\t<meta name="keywords" content="站点关键词">\r\n\t\t<link rel="stylesheet" href="style/base/jw.css" type="text/css" media="all">\r\n\t\t\t<link rel="stylesheet" href="style/standard/jw.css" type="text/css" media="all">\r\n\t\t\t\t\r\n\t\t\t\t\t<script language="javascript" src="style/js/iframeautoheight.js"></script>\r\n\t\t\t\t<!--[if IE 6]> \r\n<script src="style/js/ie6comm.j

# 爬取教务网站上的个人信息

In [27]:
def get_stuinfo(response):
    '''通过解析学生个人信息页面获得学生基本信息'''
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    d = {}
    d["studentnumber"] = soup.find(id="xh").string
    d["idCardNumber"] = soup.find(id="lbl_sfzh").string
    d["name"] = soup.find(id="xm").string
    d["sex"] = soup.find(id="lbl_xb").string
    d["enterSchoolTime"] = soup.find(id="lbl_rxrq").string
    d["birthsday"] = soup.find(id="lbl_csrq").string
    d["highschool"] = soup.find(id="lbl_byzx").string
    d["nationality"] = soup.find(id="lbl_mz").string
    d["hometown"] = soup.find(id="lbl_jg").string
    d["politicsStatus"] = soup.find(id="lbl_zzmm").string
    d["college"] = soup.find(id="lbl_xy").string
    d["major"] = soup.find(id="lbl_zymc").string
    d["classname"] = soup.find(id="lbl_xzb").string
    d["gradeClass"] = soup.find(id="lbl_dqszj").string
    return d

baseUrl = 'http://202.192.18.184'
infourl = "http://202.192.18.184/xsgrxx.aspx?xh="+username+"&"#查询个人信息的网址构造
jwsession.get(baseUrl)
res = jwsession.get(infourl)
info = get_stuinfo(res)
info

{'birthsday': '19980514',
 'classname': '物联171',
 'college': '物理与电子工程学院',
 'enterSchoolTime': '20170901',
 'gradeClass': '2017',
 'highschool': '珠海市斗门区第一中学',
 'hometown': '湛江遂溪',
 'idCardNumber': '440823199805140037',
 'major': '物联网工程',
 'name': '詹逸',
 'nationality': '汉族',
 'politicsStatus': '共青团员',
 'sex': '男',
 'studentnumber': '1719500024'}