In [1]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [56]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



In [5]:
soup.title

<title>The Dormouse's story</title>

In [6]:
soup.title.name

'title'

In [7]:
soup.title.string

"The Dormouse's story"

In [8]:
soup.title.parent

<head><title>The Dormouse's story</title></head>

In [9]:
soup.title.parent.name

'head'

In [26]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [11]:
soup.p.name

'p'

In [13]:
soup.p['class']

['title']

In [14]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [15]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [16]:
soup.find(id='link3')

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [None]:
# print all text on page
print(soup.get_text())

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [19]:
#extract all URLs from the page (found within <a> tags)
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [20]:
#extract all URLs from the page (found within <a> tags)
for link in soup.find_all('a'):
    print(link)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


In [22]:
#extract all URLs from the page (found within <a> tags)
for link in soup.find_all('a'):
    print(link.get('id'))

link1
link2
link3


In [23]:
#extract all URLs from the page (found within <a> tags)
for link in soup.find_all('a'):
    print(link.id)

None
None
None


## Kinds Of Objects

### class Tag

In [27]:
# A Tag object corresponds to an XML or HTML tag in the original document.
tag1 = soup.p
print(tag1)

<p class="title"><b>The Dormouse's story</b></p>


In [31]:
# Every tag has a name
print(tag1.name)
tag1.name = "blockquote"
tag1

p


<blockquote class="title"><b>The Dormouse's story</b></blockquote>

An HTML or XML tag may have any number of attributes. The tag <b id="boldest"> has an attribute "id" whose value is "boldest". You can access a tag's attributes by treating the tag like a dictionary

In [33]:
tag1.attrs

{'class': ['title']}

In [35]:
tag2 = soup.a
tag2

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [36]:
tag2.attrs

{'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}

In [None]:
# deleting a tag's attribute
del tag2['href']
tag2

<a class="sister" id="link1">Elsie</a>

#### Multi-valued attributes

By default, Beautiful Soup stores the value(s) of a multi-valued attribute as a list

In [None]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.p['class']

['body', 'strikeout']

If an attribute looks like it has more than one value, but it's not a multi-valued attribute as defined by any version of the HTML standard, Beautiful Soup stores it as a simple string

In [40]:
id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
id_soup.p['id']

'my id'

In [42]:
# force all attributes to be stored as strings
no_list_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser', multi_valued_attributes=None)
no_list_soup.p['class']

'body strikeout'

In [43]:
# force all attributes to be stored as list
class_is_multi= { '*' : 'class'}
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml', multi_valued_attributes=class_is_multi)
xml_soup.p['class']

['body', 'strikeout']

### class NavigableString

In [55]:
soup1 = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup1.b
print(tag.string)
print(type(tag.string))

Extremely bold
<class 'bs4.element.NavigableString'>


## .content

In [49]:
print(tag1)
print(tag1.contents)

<blockquote class="title"><b>The Dormouse's story</b></blockquote>
[<b>The Dormouse's story</b>]


In [60]:
for string in soup.strings:
    print(string)

The Dormouse's story




The Dormouse's story


Once upon a time there were three little sisters; and their names were

Elsie
,

Lacie
 and

Tillie
;
and they lived at the bottom of a well.


...




In [62]:
# remove extra whitespace by using the .stripped_strings generator instead
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


In [63]:
for string in soup.stripped_strings:
    print(string)

The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
,
Lacie
and
Tillie
;
and they lived at the bottom of a well.
...


## Kinds of filter

In [64]:
# This code finds all the tags whose names start with the letter "b"
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

body
b


In [65]:
# find all the tags in the document, but none of the text strings
for tag in soup.find_all(True):
    print(tag.name)

html
head
title
body
p
b
p
a
a
a
p


In [66]:
#  function that returns True if a tag defines the "class" attribute but doesn't define the "id" attribute
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')
soup.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [67]:
# finds all the <a> tags and all the <b> tags
soup.find_all(["a", "b"])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

### searching by css class

In [68]:
soup.find_all("a", class_="sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [73]:
# To search for tags that match two or more CSS classes at once
print(css_soup.select("p.strikeout.body"))
print(css_soup.select("strikeout.body.p"))
print(css_soup.select("p.body.strikeout"))
print(css_soup.select("p.body.strikeou.abct")) # will look for  tags having exact match

[<p class="body strikeout"></p>]
[]
[<p class="body strikeout"></p>]
[]


### limit argument

In [78]:
soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [79]:
soup.find_all("a", limit=2)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

## CSS selectors

In [82]:
# return tags
soup.css.select("title")

[<title>The Dormouse's story</title>]

In [84]:
soup.css.select("html head title")

[<title>The Dormouse's story</title>]

In [86]:
# find tags by ID
soup.css.select("#link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

### Find tags by CSS class

In [87]:
soup.css.select(".sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [90]:
soup.css.select("[class~=sister]")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

Find tags that match any selector from a list of selectors

In [91]:
soup.css.select("#link1,#link2")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

## Parsers

In [96]:
print(BeautifulSoup("<a></p>", "html.parser"))
print(BeautifulSoup("<a></p>", "lxml"))
print(BeautifulSoup("<a></p>", "html5lib")) # <html><head></head><body><a><p></p></a></body></html>

<a></a>
<html><body><a></a></body></html>


FeatureNotFound: Couldn't find a tree builder with the features you requested: html5lib. Do you need to install a parser library?