In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [2]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [3]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [4]:
# Here are some simple ways to navigate that data structure:

# Print the whole <title> tag, markup included
print(soup.title)

# Print the name of the tag
print(soup.title.name)

# Print the text inside the tag (.string and .text give the same result here)
print(soup.title.string)
print(soup.title.text)

# Access the parent tag, then the parent's name
print(soup.title.parent)
print(soup.title.parent.name)

# Access the first <p> tag in the document
print(soup.p)

# Access an attribute by indexing the tag like a dictionary; class comes back as a list
print(soup.p['class'])

# When a tag occurs several times, attribute access returns only the first one
print(soup.a)

# Get every <a> tag as a list
print(soup.find_all('a'))

# Look a tag up by one of its attributes (here, id)
print(soup.find(id="link3"))

<title>The Dormouse's story</title>
title
The Dormouse's story
The Dormouse's story
<head><title>The Dormouse's story</title></head>
head
<p class="title"><b>The Dormouse's story</b></p>
['title']
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


In [5]:
# soup.find(class="story") is a SyntaxError because `class` is a reserved word in Python.
# Beautiful Soup therefore accepts the keyword argument `class_` instead:
print(soup.find(class_="story"))

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


In [6]:
#extracting all the URLs found in a tags
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [7]:
# how to extract all the text from the page
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



### If you get the ImportError “No module named HTMLParser”, your problem is that you’re running the Python 2 version of the code under Python 3.
### If you get the ImportError “No module named html.parser”, your problem is that you’re running the Python 3 version of the code under Python 2.

In [8]:
from urllib.request import urlopen

# Download the Beautiful Soup documentation page. The `with` block closes the
# connection even if read() raises, which a trailing close() call does not guarantee.
url = "https://www.crummy.com/software/BeautifulSoup/bs4/doc/"
with urlopen(url) as uClient:
    html = uClient.read()

# Parse the page with lxml and print the first <table> whose class is "docutils".
page_soup = BeautifulSoup(html, 'lxml')
table = page_soup.find("table",{"class":"docutils"})
print(table)

<table border="1" class="docutils">
<colgroup>
<col width="18%"></col>
<col width="35%"></col>
<col width="26%"></col>
<col width="21%"></col>
</colgroup>
<tbody valign="top">
<tr class="row-odd"><td>Parser</td>
<td>Typical usage</td>
<td>Advantages</td>
<td>Disadvantages</td>
</tr>
<tr class="row-even"><td>Python’s html.parser</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"html.parser")</span></code></td>
<td><ul class="first last simple">
<li>Batteries included</li>
<li>Decent speed</li>
<li>Lenient (as of Python 2.7.3
and 3.2.)</li>
</ul>
</td>
<td><ul class="first last simple">
<li>Not very lenient
(before Python 2.7.3
or 3.2.2)</li>
</ul>
</td>
</tr>
<tr class="row-odd"><td>lxml’s HTML parser</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"lxml")</span></code></td>
<td><ul class="first last simple">
<li>Very fast</li>
<li>Lenient</li>
</ul>
</td>
<td><ul class="firs

#### This table summarizes the advantages and disadvantages of each parser library:
<table border="1" class="docutils">
<colgroup>
<col width="18%"></col>
<col width="35%"></col>
<col width="26%"></col>
<col width="21%"></col>
</colgroup>
<tbody valign="top">
<tr class="row-odd"><td>Parser</td>
<td>Typical usage</td>
<td>Advantages</td>
<td>Disadvantages</td>
</tr>
<tr class="row-even"><td>Python’s html.parser</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"html.parser")</span></code></td>
<td><ul class="first last simple">
<li>Batteries included</li>
<li>Decent speed</li>
<li>Lenient (as of Python 2.7.3
and 3.2.)</li>
</ul>
</td>
<td><ul class="first last simple">
<li>Not very lenient
(before Python 2.7.3
or 3.2.2)</li>
</ul>
</td>
</tr>
<tr class="row-odd"><td>lxml’s HTML parser</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"lxml")</span></code></td>
<td><ul class="first last simple">
<li>Very fast</li>
<li>Lenient</li>
</ul>
</td>
<td><ul class="first last simple">
<li>External C dependency</li>
</ul>
</td>
</tr>
<tr class="row-even"><td>lxml’s XML parser</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"lxml-xml")</span></code>
<code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"xml")</span></code></td>
<td><ul class="first last simple">
<li>Very fast</li>
<li>The only currently supported
XML parser</li>
</ul>
</td>
<td><ul class="first last simple">
<li>External C dependency</li>
</ul>
</td>
</tr>
<tr class="row-odd"><td>html5lib</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"html5lib")</span></code></td>
<td><ul class="first last simple">
<li>Extremely lenient</li>
<li>Parses pages the same way a
web browser does</li>
<li>Creates valid HTML5</li>
</ul>
</td>
<td><ul class="first last simple">
<li>Very slow</li>
<li>External Python
dependency</li>
</ul>
</td>
</tr>
</tbody>
</table>

### Beautiful Soup transforms a complex HTML document into a complex tree of Python objects. But you’ll only ever have to deal with about four kinds of objects: Tag, NavigableString, BeautifulSoup, and Comment.

In [9]:
soup = BeautifulSoup('<b id="boldest">Extremely bold</b>','lxml')
tag = soup.b
type(tag)

bs4.element.Tag

In [10]:
# Every tag has a name, accessible as .name
print(tag.name)

b


In [11]:
# If you change a tag’s name, the change will be reflected in any HTML markup generated by Beautiful Soup:
tag.name = "blockquote"
tag

<blockquote id="boldest">Extremely bold</blockquote>

In [12]:
# A tag may have any number of attributes. The tag <b id="boldest"> has an attribute “id” whose value is “boldest”. 
# You can access a tag’s attributes by treating the tag like a dictionary
print(tag['id'])

#to access the whole dictionary
print(tag.attrs)

boldest
{'id': 'boldest'}


In [13]:
# You can add, remove, and modify a tag’s attributes. Again, this is done by treating the tag as a dictionary:
tag['id'] = 'verybold'
tag['another-attribute'] = 1
tag

<blockquote another-attribute="1" id="verybold">Extremely bold</blockquote>

In [14]:
del tag['id']
del tag['another-attribute']
tag

<blockquote>Extremely bold</blockquote>

In [16]:
tag['id']

KeyError: 'id'

In [None]:
print(tag.get('id'))

In [None]:
# HTML 4 defines a few attributes that can have multiple values. HTML 5 removes a couple of them, but defines a few more.
# The most common multi-valued attribute is class (that is, a tag can have more than one CSS class).
# The parser is named explicitly so the result does not depend on which parsers are installed.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'lxml')
css_soup.p['class']

In [17]:
# if an attribute looks like it has more than one value, but it’s not a multi-valued attribute as defined by any version
# of the HTML standard, Beautiful Soup will leave the attribute alone
# The parser is named explicitly; without it Beautiful Soup guesses one and emits a warning.
id_soup = BeautifulSoup('<p id="my id"></p>', 'lxml')
id_soup.p['id']



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


'my id'

In [18]:
# When you turn a tag back into a string, multiple attribute values are consolidated:
# (the parser is named explicitly so Beautiful Soup does not have to guess one)
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'lxml')
# rel is multi-valued, so it is read back as a list
print(rel_soup.a['rel'])
print('\n')
# Assigning a list of values; they are joined with spaces when the tag is printed
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)

['index']


<p>Back to the <a rel="index contents">homepage</a></p>


In [19]:
# If you parse a document as XML, there are no multi-valued attributes:
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']

'body strikeout'

In [20]:
# A string corresponds to a bit of text within a tag. Beautiful Soup uses the NavigableString class to contain these 
# bits of text:
print(tag.string)
print(type(tag.string))

Extremely bold
<class 'bs4.element.NavigableString'>


In [21]:
# You can’t edit a string in place, but you can replace one string with another, using replace_with():
tag.string.replace_with("No longer bold")
tag

<blockquote>No longer bold</blockquote>

#### A NavigableString is just like a Python Unicode string, except that it also supports some of the features described in Navigating the tree and Searching the tree.
#### The BeautifulSoup object itself represents the document as a whole. For most purposes, you can treat it as a Tag object. This means it supports most of the methods described in Navigating the tree and Searching the tree.
#### Since the BeautifulSoup object doesn’t correspond to an actual HTML or XML tag, it has no name and no attributes. But sometimes it’s useful to look at its .name, so it’s been given the special .name “[document]”:

In [22]:
soup.name

'[document]'

#### Tag, NavigableString, and BeautifulSoup cover almost everything you’ll see in an HTML or XML file, but there are a few leftover bits. The only one you’ll probably ever need to worry about is the comment:

In [23]:
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
# The parser is named explicitly so Beautiful Soup does not have to guess one.
soup = BeautifulSoup(markup, 'lxml')
# The <b> tag's only child is the HTML comment, so .string returns it
comment = soup.b.string
type(comment)

bs4.element.Comment

In [24]:
# The Comment object is just a special type of NavigableString:
comment

'Hey, buddy. Want to buy a used parser?'

### Navigating the tree

In [25]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

#### Going Down

In [26]:
# The simplest way to navigate the parse tree is to say the name of the tag you want. If you want the <head> tag, just say soup.head:
print(soup.head)
print(soup.title)

<head><title>The Dormouse's story</title></head>
<title>The Dormouse's story</title>


In [27]:
print(soup.body.b)

<b>The Dormouse's story</b>


In [28]:
soup.find('a')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [29]:
# .contents and .children
# A tag’s children are available in a list called .contents:
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [30]:
head_tag.contents

[<title>The Dormouse's story</title>]

In [31]:
title_tag = head_tag.contents[0]
title_tag

<title>The Dormouse's story</title>

In [32]:
title_tag.contents

["The Dormouse's story"]

In [33]:
len(soup.contents)

2

In [34]:
soup.contents[0].name

In [36]:
# A string does not have .contents, because it can’t contain anything:
text = title_tag.contents[0]
text.contents

AttributeError: 'NavigableString' object has no attribute 'contents'

In [None]:
# .descendants
# The .contents and .children attributes only consider a tag’s direct children. For instance, the <head> tag has a single
# direct child–the <title> tag
head_tag.contents

In [37]:
for child in head_tag.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story


In [38]:
print(len(list(soup.children)),len(list(soup.descendants)))

2 27


In [39]:
# .string
# If a tag has only one child, and that child is a NavigableString, the child is made available as .string:
title_tag.string

"The Dormouse's story"

In [40]:
# If a tag’s only child is another tag, and that tag has a .string, then the parent tag is considered to have the same
# .string as its child:
head_tag.string

"The Dormouse's story"

In [41]:
# If a tag contains more than one thing, then it’s not clear what .string should refer to, so .string is defined to be None:
print(soup.html.string)

None


In [42]:
# .strings and stripped_strings
# If there’s more than one thing inside a tag, you can still look at just the strings. Use the .strings generator:
for string in soup.strings:
    print(repr(string))

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'


In [43]:
# These strings tend to have a lot of extra whitespace, which you can remove by using the .stripped_strings generator instead:
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


#### Going up

In [44]:
# You can access an element’s parent with the .parent attribute. In the example “three sisters” document, the <head> tag is the parent of the <title> tag:
title_tag = soup.title
title_tag

<title>The Dormouse's story</title>

In [45]:
title_tag.parent

<head><title>The Dormouse's story</title></head>

In [46]:
title_tag.string.parent

<title>The Dormouse's story</title>

In [47]:
html_tag = soup.html
type(html_tag.parent)

bs4.BeautifulSoup

In [48]:
print(soup.parent)

None


In [49]:
# You can iterate over all of an element’s parents with .parents. This example uses .parents to travel from an <a> tag buried deep within the document, to the very top of the document:
link = soup.a
link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [50]:
# Walk upward from the <a> tag and print the name of each ancestor.
# .parents stops after the BeautifulSoup object ("[document]") and never yields
# None, so no None check is needed inside the loop.
for parent in link.parents:
    print(parent.name)

p
body
html
[document]


#### Going sideways

In [51]:
# <b> and <c> are siblings: both are direct children of <a>.
# The parser is named explicitly so Beautiful Soup does not have to guess one.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'lxml')
print(sibling_soup.prettify())

<html>
 <body>
  <a>
   <b>
    text1
   </b>
   <c>
    text2
   </c>
  </a>
 </body>
</html>


In [52]:
# .next_sibling and .previous_sibling
sibling_soup.b.next_sibling

<c>text2</c>

In [53]:
sibling_soup.c.previous_sibling

<b>text1</b>

In [54]:
print(sibling_soup.b.previous_sibling)
print(sibling_soup.c.next_sibling)

None
None


In [55]:
# In real documents, the .next_sibling or .previous_sibling of a tag will usually be a string containing whitespace.
link = soup.a
link.next_sibling

',\n'

In [56]:
link.next_sibling.next_sibling

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [57]:
for sibling in soup.a.next_siblings:
    print(repr(sibling))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'


In [58]:
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

' and\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
',\n'
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
'Once upon a time there were three little sisters; and their names were\n'


In [59]:
# Grab the last <a> tag and look at the sibling that follows it
last_a_tag = soup.find("a", id="link3")
last_a_tag.next_sibling

';\nand they lived at the bottom of a well.'

#### Going back and forth

In [89]:
last_a_tag.next_sibling

';\nand they lived at the bottom of a well.'

In [90]:
for i in last_a_tag.next_siblings:
    print(i)

;
and they lived at the bottom of a well.


In [91]:
last_a_tag.next_element

'Tillie'

In [92]:
for element in last_a_tag.next_elements:
    print(repr(element))

'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
<p class="story">...</p>
'...'
'\n'


In [95]:
last_a_tag.previous_element

' and\n'

In [96]:
last_a_tag.previous_element.next_element

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

### Searching the tree

In [97]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

#### Kinds of filters

In [98]:
# 1) A string
soup.find_all('b')

[<b>The Dormouse's story</b>]

In [100]:
# 2) A Regular expression
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
    
# for tags whose name contains 't'
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

body
b
html
title


In [101]:
# 3) A list
soup.find_all(["a", "b"])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [107]:
# 4) True
for tag in soup.find_all(True):
    print(tag.name)

html
head
title
body
p
b
p
a
a
a
p


In [119]:
# 5) A function
# If none of the other matches work for you, define a function that takes an element as its only argument. 
# The function should return True if the argument matches, and False otherwise.
def has_class_but_no_id(tag):
    """Filter for find_all(): match tags that define ``class`` but not ``id``.

    ``tag`` is any object exposing ``has_attr(name)``. The ``id`` check is only
    performed when the ``class`` check succeeds.
    """
    carries_class = tag.has_attr('class')
    if carries_class:
        return not tag.has_attr('id')
    return carries_class

# Use find_all(), the method name used everywhere else in this notebook, rather
# than the legacy camelCase alias findAll().
soup.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [115]:
def not_lacie(href):
    """Attribute filter: accept an href value that does not contain "lacie".

    A falsy ``href`` (``None`` or an empty string) is returned unchanged, so it
    is rejected as a match; otherwise the result is a bool.
    """
    if not href:
        return href
    return re.search("lacie", href) is None
soup.find_all(href=not_lacie)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [116]:
from bs4 import NavigableString
def surrounded_by_strings(tag):
    """Filter for find_all(): match tags whose next and previous elements are both strings.

    The previous element is only inspected when the next element is a
    NavigableString.
    """
    if not isinstance(tag.next_element, NavigableString):
        return False
    return isinstance(tag.previous_element, NavigableString)

for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)

body
p
a
a
a
p


In [117]:
# Signature: find_all(name, attrs, recursive, string, limit, **kwargs)
# The find_all() method looks through a tag’s descendants and retrieves all descendants that match your filters.

print(soup.find_all("title"))

# find all the 'p' tag with class = "title"
print(soup.find_all("p", "title"))
print(soup.find_all("a"))
print(soup.find_all(id="link2"))
import re
soup.find(string=re.compile("sisters"))

[<title>The Dormouse's story</title>]
[<p class="title"><b>The Dormouse's story</b></p>]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


'Once upon a time there were three little sisters; and their names were\n'

In [120]:
# The name argument
soup.find_all("title")

[<title>The Dormouse's story</title>]

In [121]:
# The keyword arguments
# If you pass in a value for an argument called id, Beautiful Soup will filter against each tag’s ‘id’ attribute:
soup.find_all(id='link2')

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [122]:
# If you pass in a value for href, Beautiful Soup will filter against each tag’s ‘href’ attribute:
soup.find_all(href=re.compile("elsie"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [123]:
# You can filter an attribute based on a string, a regular expression, a list, a function, or the value True.
soup.find_all(id=True)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [124]:
# You can filter multiple attributes at once by passing in more than one keyword argument:
soup.find_all(href=re.compile("elsie"), id='link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [131]:
# Some attributes, like the data-* attributes in HTML 5, have names that can’t be used as the names of keyword arguments:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>','html.parser')
# data_soup.find_all(data-foo="value")  # SyntaxError: data-foo is not a valid keyword-argument name

In [132]:
data_soup.find_all(attrs={"data-foo": "value"})

[<div data-foo="value">foo!</div>]

In [133]:
# The parser is named explicitly so Beautiful Soup does not have to guess one.
name_soup = BeautifulSoup('<input name="email"/>', 'lxml')
# name= is find_all()'s own first parameter (the tag name), so this looks for
# tags called "email" rather than for the HTML attribute name="email".
name_soup.find_all(name="email")

[]

In [134]:
name_soup.find_all(attrs={"name": "email"})

[<input name="email"/>]

#### Searching by CSS class

In [135]:
# `class` is a reserved word in Python, so search by CSS class with the keyword argument class_ instead
soup.find_all("a", class_="sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [136]:
soup.find_all(class_=re.compile("itl"))

[<p class="title"><b>The Dormouse's story</b></p>]

In [137]:
def has_six_characters(css_class):
    """Attribute filter: accept a CSS class value that is exactly six characters long.

    ``None`` (no class value) never matches.
    """
    if css_class is None:
        return False
    return len(css_class) == 6

soup.find_all(class_=has_six_characters)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [138]:
# The parser is named explicitly so Beautiful Soup does not have to guess one.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'lxml')
# Searching by either single class matches the tag ...
print(css_soup.find_all("p", class_="strikeout"))
print(css_soup.find_all("p", class_="body"))
# ... and so does the exact class string as written in the markup
print(css_soup.find_all("p", class_="body strikeout"))

[<p class="body strikeout"></p>]
[<p class="body strikeout"></p>]
[<p class="body strikeout"></p>]


In [139]:
print(css_soup.find_all("p", class_="strikeout body"))

[]


In [140]:
# If you want to search for tags that match two or more CSS classes, you should use a CSS selector:
css_soup.select("p.strikeout.body")

[<p class="body strikeout"></p>]

In [141]:
# In older versions of Beautiful Soup, which don’t have the class_ shortcut, you can use the attrs trick mentioned above
soup.find_all("a", attrs={"class": "sister"})

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [142]:
# The string argument

# With string you can search for strings instead of tags. As with name and the keyword arguments, you can pass in a 
# string, a regular expression, a list, a function, or the value True
print(soup.find_all(string="Elsie"))
print(soup.find_all(string=["Tillie", "Elsie", "Lacie"]))
print(soup.find_all(string=re.compile("Dormouse")))


['Elsie']
['Elsie', 'Lacie', 'Tillie']
["The Dormouse's story", "The Dormouse's story"]


In [143]:
def is_the_only_string_within_a_tag(s):
    """String filter: accept ``s`` when it equals the ``.string`` of its parent tag.

    The parent's ``.string`` is None when the parent holds more than one thing,
    in which case ``s`` is rejected.
    """
    parent_text = s.parent.string
    return s == parent_text

soup.find_all(string=is_the_only_string_within_a_tag)

["The Dormouse's story",
 "The Dormouse's story",
 'Elsie',
 'Lacie',
 'Tillie',
 '...']

In [144]:
soup.find_all("a", string="Elsie")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [145]:
soup.find_all("a", text="Elsie")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [146]:
# The limit argument

# find_all() returns all the tags and strings that match your filters. This can take a while if the document is large. 
# If you don’t need all the results, you can pass in a number for limit. This works just like the LIMIT keyword in SQL. 
# It tells Beautiful Soup to stop gathering results after it’s found a certain number.
print(soup.find_all("a", limit=2))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


In [147]:
# The recursive argument
print(soup.html.find_all("title"))
print(soup.html.find_all("title", recursive=False))

[<title>The Dormouse's story</title>]
[]


In [148]:
# Calling a tag is like calling find_all()
soup.find_all("a") 
# soup("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [149]:
soup.title.find_all(string=True)
soup.title(string=True)

["The Dormouse's story"]

#### Signature: find(name, attrs, recursive, string, **kwargs)

In [150]:
print(soup.find_all('title', limit=1))
print(soup.find('title'))

[<title>The Dormouse's story</title>]
<title>The Dormouse's story</title>


In [151]:
# If find_all() can’t find anything, it returns an empty list. If find() can’t find anything, it returns None:
print(soup.find("nosuchtag"))

None


In [152]:
soup.head.title
soup.find("head").find("title")

<title>The Dormouse's story</title>

#### find_parents() and find_parent()
#### Signature: find_parents(name, attrs, string, limit, **kwargs)
#### Signature: find_parent(name, attrs, string, **kwargs)

In [153]:
a_string = soup.find(string="Lacie")
a_string

'Lacie'

In [154]:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title"><b>The Dormouse's story</b></p>

# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>

# <p class="story">...</p>

a_string.find_parents("a")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [155]:
a_string.find_parent("p")

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

In [157]:
a_string.find_parents("p", class_="title")

[]

In [158]:
# a_string is the NavigableString 'Lacie', which behaves like a Python string.
# This call therefore acts as a substring search in the style of str.find, not a
# tag search: it gives -1 because the text 'Lacie' does not contain "p".
a_string.find("p")

-1

#### find_next_siblings() and find_next_sibling()
#### Signature: find_next_siblings(name, attrs, string, limit, **kwargs)
#### Signature: find_next_sibling(name, attrs, string, **kwargs)

In [159]:
first_link = soup.a
first_link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [160]:
first_link.find_next_siblings("a")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [162]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_next_sibling("p")

<p class="story">...</p>

#### find_previous_siblings() and find_previous_sibling()
#### Signature: find_previous_siblings(name, attrs, string, limit, **kwargs)
#### Signature: find_previous_sibling(name, attrs, string, **kwargs)

In [163]:
last_link = soup.find("a", id="link3")
last_link

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [164]:
last_link.find_previous_siblings("a")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [166]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_previous_sibling("p")

<p class="title"><b>The Dormouse's story</b></p>

#### find_all_next() and find_next()
#### Signature: find_all_next(name, attrs, string, limit, **kwargs)
#### Signature: find_next(name, attrs, string, **kwargs)

In [167]:
first_link = soup.a
first_link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [168]:
first_link.find_all_next(string=True)

['Elsie',
 ',\n',
 'Lacie',
 ' and\n',
 'Tillie',
 ';\nand they lived at the bottom of a well.',
 '\n',
 '...',
 '\n']

In [169]:
first_link.find_next("p")

<p class="story">...</p>

#### find_all_previous() and find_previous()
#### Signature: find_all_previous(name, attrs, string, limit, **kwargs)
#### Signature: find_previous(name, attrs, string, **kwargs)

In [170]:
first_link = soup.a
first_link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [171]:
first_link.find_all_previous("p")

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="title"><b>The Dormouse's story</b></p>]

In [172]:
first_link.find_previous("title")

<title>The Dormouse's story</title>

### CSS selectors