In [1]:
import re

from bs4 import BeautifulSoup

In [2]:
# Sample document used by every example in this notebook: a title, one
# <p class="title">, and two <p class="story"> paragraphs, the first of which
# holds three <a class="sister"> links with ids link1..link3.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# The parser is named explicitly ('html.parser') rather than left for
# Beautiful Soup to choose.
soup = BeautifulSoup(html_doc, 'html.parser')

# Kinds of filters

### A string

In [3]:
# A plain string is compared with the tag name exactly: only <b> matches, not <body>.
soup.find_all('b')

[<b>The Dormouse's story</b>]

### A regular expression


In [4]:
# A compiled pattern is tested against each tag name using the pattern's
# search() method, so "^b" selects every tag whose name starts with "b".
# (`re` is imported together with the other imports at the top of the notebook.)
starts_with_b = re.compile("^b")
for tag in soup.find_all(starts_with_b):
    print(tag.name)

body
b


### A list

In [5]:
# A list matches a tag if any one of its items matches: here every <a> or <b> tag.
soup.find_all(["a", "b"])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

### True

In [6]:
# True matches every tag in the document; text nodes are not included.
every_tag = soup.find_all(True)
for element in every_tag:
    print(element.name)

html
head
title
body
p
b
p
a
a
a
p


### A function

In [9]:
def has_class_but_no_id(tag):
    """Tag filter: True when `tag` has a ``class`` attribute and no ``id`` attribute."""
    # Guard first: a tag without a class can never qualify.
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')

# Only the three <p> tags match. The <a> tags have an id, so they are not
# results themselves; they show up below only as content of their parent <p>.
soup.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

# `find_all()`
`find_all()` looks through a tag's descendants and retrieves all of them that match your filters.

In [11]:
# Match by tag name; the result is always a list, even for a single hit.
soup.find_all("title")

[<title>The Dormouse's story</title>]

In [12]:
# The second positional argument is matched against the CSS class:
# this returns the <p> whose class is "title".
soup.find_all("p", "title")

[<p class="title"><b>The Dormouse's story</b></p>]

In [13]:
# All three links, in document order.
soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [14]:
# A keyword argument filters on the attribute of the same name, whatever the tag.
soup.find_all(id="link2")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [15]:
# Note this is find(), not find_all(): it returns the first match only.
# With string= alone, the match is the text node itself rather than its enclosing tag.
soup.find(string=re.compile("sisters"))

'Once upon a time there were three little sisters; and their names were\n'

### The keyword arguments
Any argument that’s not recognized will be turned into a filter on one of a tag’s attributes.

In [16]:
# `href` is not a named parameter of find_all(), so it becomes an attribute filter;
# the pattern selects tags whose href contains "elsie".
soup.find_all(href=re.compile("elsie"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [23]:
# True as an attribute value matches every tag that has the attribute, regardless of its value.
soup.find_all(id=True)


[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [24]:
# Several attribute filters can be given at once; a tag must satisfy all of them.
soup.find_all(href=re.compile("elsie"), id='link1')


[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [25]:
# Attribute names that cannot be written as Python keyword arguments
# (here `class`, which is a reserved word) can be passed through the `attrs` dictionary instead.
soup.find_all(attrs={"class": "story"})

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [26]:
# `name` cannot be used as a keyword attribute filter: find_all() reserves its
# `name` parameter for the tag name, so name="email" searches for <email> tags
# and finds nothing. Filter on the HTML `name` attribute through `attrs` instead.
# The parser is named explicitly so the result does not depend on which parsers
# happen to be installed and no "no parser was explicitly specified" warning is raised.
name_soup = BeautifulSoup('<input name="email"/>', 'html.parser')
name_soup.find_all(name="email"), name_soup.find_all(attrs={"name": "email"})

([], [<input name="email"/>])

### Searching by CSS class


In [27]:
# `class` is a reserved word in Python, so the keyword argument is spelled `class_`.
soup.find_all("a", class_="sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [28]:
# class_ also accepts a regular expression: "itl" occurs in "title" but not in "story" or "sister".
soup.find_all(class_=re.compile("itl"))

[<p class="title"><b>The Dormouse's story</b></p>]

In [29]:
def has_six_characters(css_class):
    """Class filter: True when `css_class` is a value of exactly six characters.

    A missing class (``None``) never matches.
    """
    if css_class is None:
        return False
    return len(css_class) == 6

# Only "sister" has six characters ("title" and "story" have five), so only the links match.
soup.find_all(class_=has_six_characters)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [30]:
# A tag can carry several CSS classes; searching for any single one of them matches the tag.
# The parser is named explicitly, as for the main `soup`, so the result does not
# depend on which parsers happen to be installed.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.find_all("p", class_="strikeout"), css_soup.find_all("p", class_="body")

([<p class="body strikeout"></p>], [<p class="body strikeout"></p>])

In [32]:
# Search for tags that carry two or more CSS classes at the same time.
# A CSS selector expresses "both classes" directly. The find_all() equivalent
# needs a tag filter: a pattern such as re.compile("(strikeout)|(body)") would
# match a tag that has *either* class, not necessarily both.
def has_strikeout_and_body(tag):
    """Tag filter: True for <p> tags whose class list contains both 'strikeout' and 'body'."""
    return tag.name == "p" and {"strikeout", "body"} <= set(tag.get("class", []))

css_soup.select("p.strikeout.body"), css_soup.find_all(has_strikeout_and_body)

([<p class="body strikeout"></p>], [<p class="body strikeout"></p>])

In [33]:
# The attrs-dictionary spelling of the class_="sister" search; it returns the same three links.
soup.find_all("a", attrs={"class": "sister"})

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

### the `string` argument

In [34]:
# With string= alone the search runs over text, so the result holds strings rather than tags.
soup.find_all(string="Elsie")

['Elsie']

In [35]:
# Combined with a tag name, string= selects the tags whose text is "Elsie".
soup.find_all("a", string="Elsie")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [36]:
# Legacy spelling: before Beautiful Soup 4.4.0 the `string` argument was called `text`.
# Shown for comparison only (same result as string="Elsie"); prefer `string` in new code.
soup.find_all("a", text="Elsie")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

### the `limit` argument

In [37]:
# limit=2 returns only the first two matches, although the document has three <a> tags.
soup.find_all("a", limit=2)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

### the `recursive` argument
Pass `recursive=False` to search only a tag's direct children instead of all of its descendants.

In [38]:
# <title> sits inside <head>, so it is a descendant of <html> but not a direct child:
# the default search finds it, the recursive=False search returns an empty list.
soup.html.find_all("title"), soup.html.find_all("title", recursive=False)

([<title>The Dormouse's story</title>], [])

## Calling a tag is like calling `find_all()`

In [39]:
# Calling the soup object directly is shorthand for soup.find_all('a', class_='sister').
soup('a', class_='sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# `find()`

In [40]:
# find_all() with limit=1 still returns a list, here with a single element.
soup.find_all('title', limit=1)

[<title>The Dormouse's story</title>]

In [41]:
# find() returns the first matching element itself instead of a one-element list.
soup.find('title')

<title>The Dormouse's story</title>

In [42]:
# When nothing matches, find() returns None (find_all() would return an empty list).
# print() is used because a bare None result is not echoed by the notebook.
print(soup.find("nosuchtag"))

None


# Other methods:
- `find_parents()` and `find_parent()` 

- `find_next_siblings()` and `find_next_sibling()`

- `find_previous_siblings()` and `find_previous_sibling()`
- `find_all_next()` and `find_next()`
- `find_all_previous()` and `find_previous()`