# 网络爬虫的信息提取

## BeautifulSoup

In [2]:
import requests
from bs4 import BeautifulSoup

url = 'https://python123.io/ws/demo.html'
# Fetch the demo page. The timeout keeps an unresponsive server from blocking
# the kernel indefinitely.
demo = requests.get(url, timeout=10)
# Stop here on an HTTP error status instead of silently parsing an error page
# (a bare `demo.status_code` in the middle of a cell is neither shown nor checked).
demo.raise_for_status()

# Parse the HTML with the parser bundled with Python and show the page text.
soup = BeautifulSoup(demo.text, 'html.parser')
soup.text.strip()

'This is a python demo page\n\nThe demo python introduces several python courses.\nPython is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\nBasic Python and Advanced Python.'

## BeautifulSoup基本元素
### BeautifulSoup库是解析、遍历、维护“标签树”的功能库

### 标签的格式

In [3]:
# Anatomy of a tag, using <p class='title'>..</p> as the example:
# <p>..</p>      : a tag (Tag)
# p              : the name (Name); the opening and closing tags come as a pair
# class='title'  : an attribute (Attributes); a tag has zero or more of them

### BeautifulSoup库的引用
#### `from bs4 import BeautifulSoup`
#### `import bs4`

In [4]:
# The BeautifulSoup class represents a tag tree, i.e. an HTML document.
# For example:
# soup = BeautifulSoup('<html>data</html>', 'html.parser')
# soup2 = BeautifulSoup(open('D://demo.html'), 'html.parser')

### BeautifulSoup类 对应一个HTML XML文档的内容

### BeautifulSoup库解析器
#### 'html.parser' 安装bs4
#### 'lxml'  pip install lxml
#### 'xml' pip install lxml
#### 'html5lib' pip install html5lib

### BeautifulSoup类的基本元素

In [5]:
#### Tag: a tag, the most basic unit of information; <> marks its start and </> its end
#### Name: the tag's name, the letters inside the angle brackets, e.g. <p>; accessed as <tag>.name
#### Attributes: the tag's attributes, organised as a dictionary; accessed as <tag>.attrs
#### NavigableString: the non-attribute string inside a tag, i.e. the text in <>...</>; accessed as <tag>.string
#### Comment: the comment part of the string inside a tag

In [6]:
# Re-parse the page fetched earlier (`demo`) so the examples in this section
# start from a fresh tag tree. BeautifulSoup is already imported in the
# notebook's first code cell, so the import is not repeated here.
soup = BeautifulSoup(demo.text, 'html.parser')

In [7]:
# Get the page title: the <title> tag
soup.title

<title>This is a python demo page</title>

In [8]:
# Get an <a> tag; attribute access returns only the first <a> in the document
soup.a

<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>

In [9]:
# Get the name of the <a> tag
soup.a.name

'a'

In [10]:
# Get the name of the tag one level above <a>, i.e. its parent tag
soup.a.parent.name

'p'

In [11]:
# Name of the parent of the first <p> tag
soup.p.parent.name

'body'

In [12]:
# Get the tag's attributes
soup.a.attrs

{'class': ['py1'],
 'href': 'http://www.icourse163.org/course/BIT-268001',
 'id': 'link1'}

In [13]:
# Look up one attribute by key; the class value comes back as a list
soup.a.attrs['class']

['py1']

In [14]:
# The href value comes back as a plain string
soup.a.attrs['href']

'http://www.icourse163.org/course/BIT-268001'

In [15]:
# Confirm that .attrs is an ordinary dict
type(soup.a.attrs)

dict

In [16]:
# A tag reached through the soup is a bs4.element.Tag
type(soup.a)

bs4.element.Tag

In [17]:
# Get the NavigableString: the text inside the tag
soup.a.string

'Basic Python'

In [18]:
# .string also works when the text is nested one level down: the first <p>
# holds only a <b> tag, and the text of that <b> is what comes back
soup.p.string

'The demo python introduces several python courses.'

In [19]:
# The text of a tag is a bs4.element.NavigableString
type(soup.p.string)

bs4.element.NavigableString

In [20]:
# Handling comments: build a small document whose <b> tag holds only an HTML
# comment and whose <p> tag holds ordinary text
newsoup = BeautifulSoup('<b><!--this is a comment--></b><p>this is not a comment</p>', 'html.parser')

In [21]:
# .string on a tag holding only a comment returns the comment text without
# the <!-- --> markers
newsoup.b.string

'this is a comment'

In [22]:
# .string on a tag holding ordinary text
newsoup.p.string

'this is not a comment'

In [23]:
# The displayed value of a comment looks like ordinary text, so check the
# type to tell them apart: a comment is a bs4.element.Comment
type(newsoup.b.string)

bs4.element.Comment

In [24]:
# Ordinary tag text is a NavigableString, not a Comment
type(newsoup.p.string)

bs4.element.NavigableString

## 基于bs4库的HTML内容遍历方法
### 上行遍历 下行遍历 平行遍历

### 标签树的下行遍历
#### .contents 子节点列表 将所有儿子节点存入列表
#### .children 子节点的迭代类型 用于遍历儿子节点
#### .descendants 子孙节点的迭代类型 包含所有子孙节点 用于循环遍历

In [25]:
# Re-parse the fetched page so the traversal examples start from a fresh tree
soup = BeautifulSoup(demo.text, 'html.parser')

In [26]:
# The <head> tag
soup.head

<head><title>This is a python demo page</title></head>

In [27]:
# .contents returns the children of <head> as a list
soup.head.contents

[<title>This is a python demo page</title>]

In [28]:
# The children of <body>; the newline strings between tags count as child nodes too
soup.body.contents

['\n',
 <p class="title"><b>The demo python introduces several python courses.</b></p>,
 '\n',
 <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
 <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>,
 '\n']

In [29]:
# Five children: two <p> tags plus three newline strings
len(soup.body.contents)

5

In [30]:
# Iterate over the direct children of <body>
for child in soup.body.children:
    print(child)



<p class="title"><b>The demo python introduces several python courses.</b></p>


<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>




In [31]:
# Iterate over all descendants of <body>: each tag is printed, followed by
# the nodes nested inside it
for descendant in soup.body.descendants:
    print(descendant)



<p class="title"><b>The demo python introduces several python courses.</b></p>
<b>The demo python introduces several python courses.</b>
The demo python introduces several python courses.


<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:

<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>
Basic Python
 and 
<a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>
Advanced Python
.




### 标签树的上行遍历
#### .parent 节点的父亲标签
#### .parents 节点先辈标签的迭代类型 用于循环先辈节点

In [32]:
# The parent of <title> is <head>
soup.title.parent

<head><title>This is a python demo page</title></head>

In [33]:
# The parent of <html> is the BeautifulSoup object itself, which displays as
# the whole document
soup.html.parent

<html><head><title>This is a python demo page</title></head>
<body>
<p class="title"><b>The demo python introduces several python courses.</b></p>
<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
</body></html>

In [34]:
# The BeautifulSoup object has no parent: this evaluates to None, so the cell
# shows no output
soup.parent

In [35]:
# Walk upward from the first <a> tag and print the name of every ancestor.
# `.parents` stops when the parent chain runs out and never yields None, so no
# None check is needed. The last ancestor is the BeautifulSoup object itself,
# whose name is '[document]'.
for parent in soup.a.parents:
    print(parent.name)

p
body
html
[document]


### 标签树的平行遍历
#### .next_sibling 返回按照HTML文本顺序的下一个平行节点标签
#### .previous_sibling 返回按照HTML文本顺序的上一个平行节点标签
#### .next_siblings 迭代类型
#### .previous_siblings 迭代类型
### 平行遍历发生在同一个父节点下的各节点间

In [36]:
# The node right after the first <a> is a text node, not a tag
soup.a.next_sibling

' and '

In [37]:
# Step over the text node after the first <a> to reach the second <a> tag
soup.a.next_sibling.next_sibling

<a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>

In [38]:
# The node right before the first <a> is the leading text of the paragraph
soup.a.previous_sibling

'Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n'

In [39]:
# Nothing precedes the paragraph's leading text inside <p>: this evaluates to
# None, so the cell shows no output
soup.a.previous_sibling.previous_sibling

In [40]:
# Iterate over the siblings that follow the first <a>
for sibling in soup.a.next_siblings:
    print(sibling)

 and 
<a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>
.


In [41]:
# Iterate over the siblings that precede the first <a>
for sibling in soup.a.previous_siblings:
    print(sibling)

Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:



## 基于bs4库的HTML格式化和编码
### prettify() 方法

In [42]:
# prettify() renders the document as an indented string with each tag and
# text node on its own line; print it so the line breaks are shown
print(soup.prettify())

<html>
 <head>
  <title>
   This is a python demo page
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The demo python introduces several python courses.
   </b>
  </p>
  <p class="course">
   Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
   <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">
    Basic Python
   </a>
   and
   <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">
    Advanced Python
   </a>
   .
  </p>
 </body>
</html>


## 信息标记的三种形式

### 优点：
#### 标记后的信息可形成信息组织结构 增加了信息维度
#### 标记后的信息可用于通信 存储或展示
#### 标记的结构与信息一样具有重要价值
#### 标记后的信息更利于程序理解和运用

### HTML
#### HTML：HyperText Markup Language
#### 超文本标记语言 可在文本中嵌入声音 图像 视频等超文本信息
#### 是www的信息组织方式
#### 通过预定义 以标签形式处理不同类型的信息

### XML
#### eXtensible Markup Language

### JSON 有类型的键值对 "key": "value"
#### JavaScript Object Notation

### YAML 无类型的键值对 key: value

## 信息标记三种形式的比较
### XML 最早的通用信息标记语言 可扩展性好 但繁琐
#### Internet信息交互表达
### JSON 信息有类型 适合程序处理 较XML简洁
#### 移动应用云端和节点的信息通信 无注释
### YAML 信息无类型 文本信息比例最高 可读性好
#### 各类系统的配置文件 有注释易读

## 信息提取的一般方法
### 方法一：完整解析信息的标记形式 再提取关键信息
#### 例如：bs4库的标签树遍历
#### 优点：信息解析准确
#### 缺点：提取过程繁琐 速度慢
### 方法二：无视标记形式 直接搜索关键信息
#### 对信息的文本查找函数
#### 优点：提取过程简洁 速度较快
#### 缺点：提取结果准确性与信息内容相关
### 方法三：融合方法 结合形式解析与搜索方法 提取关键信息
#### 需要标记解析器及文本查找函数

In [43]:
# Example: print the link target (href) of every <a> tag
for link in soup.find_all('a'):
    print(link.get('href'))

http://www.icourse163.org/course/BIT-268001
http://www.icourse163.org/course/BIT-1001870001


## 基于bs4库的HTML内容查找方法

In [48]:
# .find_all(name, attrs, recursive, string, **kwargs)
# name: the tag name to search for
soup.find_all('a')

[<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>,
 <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]

In [49]:
# A list of names matches tags with any of those names
soup.find_all(['a', 'b'])

[<b>The demo python introduces several python courses.</b>,
 <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>,
 <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]

In [52]:
# Passing True matches every tag in the document
for tag in soup.find_all(True):
    print(tag.name)

html
head
title
body
p
b
p
a
a


In [53]:
# A compiled regular expression matches every tag whose name contains a match
# for the pattern, so 'b' finds both <body> and <b>.
# NOTE(review): consider moving this import to the notebook's first import cell.
import re
for tag in soup.find_all(re.compile('b')):
    print(tag.name)

body
b


In [54]:
# attrs: search on attribute values. A plain string in the attrs position is
# matched against the tag's class, so this finds <p> tags with class 'course'.
soup.find_all('p', 'course')

[<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
 <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>]

In [55]:
# A keyword argument searches on the attribute of that name, here id
soup.find_all(id='link1')

[<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>]

In [56]:
# A string value must match the whole attribute value, so 'link' finds nothing
soup.find_all(id='link')

[]

In [57]:
# A regular expression matches every id that contains 'link'
soup.find_all(id=re.compile('link'))

[<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>,
 <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]

In [58]:
# recursive: whether to search all descendants; defaults to True
soup.find_all('a')

[<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>,
 <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]

In [59]:
# With recursive=False only direct children are searched; no <a> tag is a
# direct child of the document root, so nothing is found
soup.find_all('a', recursive=False)

[]

In [60]:
# string: search the text between <>...</>; display the document for reference
soup

<html><head><title>This is a python demo page</title></head>
<body>
<p class="title"><b>The demo python introduces several python courses.</b></p>
<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
</body></html>

In [61]:
# A plain string matches only text nodes equal to it
soup.find_all(string='Basic Python')

['Basic Python']

In [62]:
# A regular expression matches every text node that contains 'Python'
soup.find_all(string=re.compile('Python'))

['Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n',
 'Basic Python',
 'Advanced Python']

### 扩展方法

In [None]:
# .find()                    search and return only the first result (None if nothing matches); same parameters as .find_all()
# .find_parents()            search the ancestors; returns a list; same parameters as .find_all()
# .find_parent()             search the ancestors; returns a single result; same parameters as .find()
# .find_next_siblings()      search the following siblings; returns a list; same parameters as .find_all()
# .find_next_sibling()       search the following siblings; returns a single result; same parameters as .find()
# .find_previous_siblings()  search the preceding siblings; returns a list; same parameters as .find_all()
# .find_previous_sibling()   search the preceding siblings; returns a single result; same parameters as .find()