# BeautifulSoup

## 基本使用

In [None]:
from bs4 import BeautifulSoup

# Sample HTML document that the rest of the notebook queries.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse the markup with the lxml backend; later cells reuse `soup`.
soup = BeautifulSoup(html, features='lxml')

# Print the prettified markup first, then the text held by <title>.
pretty_markup = soup.prettify()
print(pretty_markup)
print(soup.title.string)

## 标签选择器

### 选择元素

In [None]:
# Show the type of an element reached by attribute access on the soup,
# then print each of the elements themselves in the same order as before.
print(type(soup.title))
for selected in (soup.title, soup.head, soup.p, soup.a):
    print(selected)

### 获取名称

In [None]:
# Print the name of the <title> element.
title_tag = soup.title
print(title_tag.name)

### 获取属性

In [None]:
# Read the `class` attribute two ways: through `.attrs` and by indexing
# the element directly.
first_paragraph = soup.p
print(first_paragraph.attrs['class'])
print(first_paragraph['class'])

### 获取内容

In [None]:
# Print the `.string` of the element returned by `soup.p`.
first_paragraph = soup.p
print(first_paragraph.string)

### 嵌套选择

In [None]:
# Attribute access can be chained: reach <title> through <head>.
head_title = soup.head.title
print(head_title.string)

### 子节点和子孙节点

In [None]:
# Print `.contents` of <body>.
body_contents = soup.body.contents
print(body_contents)

In [32]:
# Printing `.children` itself only shows an iterator object, so it is
# enumerated afterwards to list each node with its position.
print(soup.body.children)
for position, node in enumerate(soup.body.children):
    print(position, node)

<list_iterator object at 0x000000A3D80FEBA8>
0 

1 <p class="title"><b>The Dormouse's story</b></p>
2 

3 <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
4 

5 <p class="story">...</p>
6 



In [34]:
# Printing `.descendants` itself only shows a generator object, so it is
# enumerated afterwards to list every nested node with its position.
print(soup.body.descendants)
for position, node in enumerate(soup.body.descendants):
    print(position, node)

<generator object descendants at 0x000000A3D6E8B360>
0 

1 <p class="title"><b>The Dormouse's story</b></p>
2 <b>The Dormouse's story</b>
3 The Dormouse's story
4 

5 <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
6 Once upon a time there were three little sisters; and their names were

7 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
8 Elsie
9 ,

10 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
11 Lacie
12  and

13 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
14 Tillie
15 ;
and they lived at the bottom of a well.
16 

17 <p class="story">...</p>
18 ...
19 



In [36]:
# Print the element that directly encloses the link returned by `soup.a`.
first_link = soup.a
print(first_link.parent)

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


In [40]:
# Materialise every ancestor of the link, paired with its index.
indexed_ancestors = list(enumerate(soup.a.parents))
print(indexed_ancestors)

[(0, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>), (1, <body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>), (2, <html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their 

In [43]:
# List the nodes after and before the link returned by `soup.a`,
# each paired with its index.
first_link = soup.a
following = list(enumerate(first_link.next_siblings))
preceding = list(enumerate(first_link.previous_siblings))
print(following)
print(preceding)

[(0, ',\n'), (1, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (2, ' and\n'), (3, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (4, ';\nand they lived at the bottom of a well.')]
[(0, 'Once upon a time there were three little sisters; and their names were\n')]


## 标准选择器

#### name

In [49]:
# Collect every <a> element, then show the type of one entry.
links = soup.find_all("a")
print(links)
print(type(links[0]))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<class 'bs4.element.Tag'>


In [59]:
# `find_all` can also be called on each result to search inside it.
for paragraph in soup.find_all("p"):
    nested_links = paragraph.find_all('a')
    print(nested_links)

[]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[]


#### attrs

In [70]:
# Filter by attribute values supplied as a dict.
story_filter = {'class': 'story'}
print(soup.find_all(attrs=story_filter))

[<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="story">...</p>]


In [72]:
# Same filter written with the `class_` keyword.
story_nodes = soup.find_all(class_='story')
print(story_nodes)

[<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="story">...</p>]


#### text

In [73]:
# Match on text content. `string=` is the current name of this filter;
# the older `text=` spelling is deprecated in recent Beautiful Soup releases.
print(soup.find_all(string='Elsie'))

['Elsie']


## CSS选择器

In [84]:
# Run each CSS selector in turn and print its matches.
for selector in ('.story .sister', '.story #link1', 'p a'):
    print(soup.select(selector))

# Show the type of one entry returned by `select`.
print(type(soup.select('a')[0]))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<class 'bs4.element.Tag'>


In [85]:
# `select` can also be called on each result to search inside it.
for paragraph in soup.select('p'):
    nested_links = paragraph.select('a')
    print(nested_links)

[]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[]


### 获取属性

In [86]:
# Print the `class` attribute of every selected link.
for link in soup.select('a'):
    class_value = link['class']
    print(class_value)

['sister']
['sister']
['sister']


### 获取内容

In [87]:
# Print the text of every selected link.
for link in soup.select('a'):
    link_text = link.get_text()
    print(link_text)

Elsie
Lacie
Tillie
