In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [2]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [3]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [17]:
# Here are some simple ways to navigate that data structure:

#to print the whole tag
print(soup.title)

#to print the name of the tag
print(soup.title.name)

#to print text
print(soup.title.string)
print(soup.title.text)

#to access the parent tag
print(soup.title.parent)
print(soup.title.parent.name)

#accessing paragraph tag
print(soup.p)

#to access attributes
print(soup.p['class'])

#to access tag which are present multiple times
print(soup.a)

#to access all 'a' tags
print(soup.find_all('a'))

#to access tag on the basis of attributes
print(soup.find(id="link3"))

<title>The Dormouse's story</title>
title
The Dormouse's story
The Dormouse's story
<head><title>The Dormouse's story</title></head>
head
<p class="title"><b>The Dormouse's story</b></p>
['title']
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


In [18]:
# print(soup.find(class="story"))#since class is an existing python keyword so this line will generate error
# solution for class attributes
print(soup.find(class_="story"))

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


In [19]:
#extracting all the URLs found in a tags
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [20]:
# how to extract all the text from the page
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



### If you get the ImportError “No module named HTMLParser”, your problem is that you’re running the Python 2 version of the code under Python 3.
### If you get the ImportError “No module named html.parser”, your problem is that you’re running the Python 3 version of the code under Python 2.

In [27]:
from urllib.request import urlopen
url = "https://www.crummy.com/software/BeautifulSoup/bs4/doc/"
uClient = urlopen(url)
html = uClient.read()
uClient.close()
page_soup = BeautifulSoup(html, 'lxml')
table = page_soup.find("table",{"class":"docutils"})
print(table)

<table border="1" class="docutils">
<colgroup>
<col width="18%"></col>
<col width="35%"></col>
<col width="26%"></col>
<col width="21%"></col>
</colgroup>
<tbody valign="top">
<tr class="row-odd"><td>Parser</td>
<td>Typical usage</td>
<td>Advantages</td>
<td>Disadvantages</td>
</tr>
<tr class="row-even"><td>Python’s html.parser</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"html.parser")</span></code></td>
<td><ul class="first last simple">
<li>Batteries included</li>
<li>Decent speed</li>
<li>Lenient (as of Python 2.7.3
and 3.2.)</li>
</ul>
</td>
<td><ul class="first last simple">
<li>Not very lenient
(before Python 2.7.3
or 3.2.2)</li>
</ul>
</td>
</tr>
<tr class="row-odd"><td>lxml’s HTML parser</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"lxml")</span></code></td>
<td><ul class="first last simple">
<li>Very fast</li>
<li>Lenient</li>
</ul>
</td>
<td><ul class="firs

#### This table summarizes the advantages and disadvantages of each parser library:
<table border="1" class="docutils">
<colgroup>
<col width="18%"></col>
<col width="35%"></col>
<col width="26%"></col>
<col width="21%"></col>
</colgroup>
<tbody valign="top">
<tr class="row-odd"><td>Parser</td>
<td>Typical usage</td>
<td>Advantages</td>
<td>Disadvantages</td>
</tr>
<tr class="row-even"><td>Python’s html.parser</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"html.parser")</span></code></td>
<td><ul class="first last simple">
<li>Batteries included</li>
<li>Decent speed</li>
<li>Lenient (as of Python 2.7.3
and 3.2.)</li>
</ul>
</td>
<td><ul class="first last simple">
<li>Not very lenient
(before Python 2.7.3
or 3.2.2)</li>
</ul>
</td>
</tr>
<tr class="row-odd"><td>lxml’s HTML parser</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"lxml")</span></code></td>
<td><ul class="first last simple">
<li>Very fast</li>
<li>Lenient</li>
</ul>
</td>
<td><ul class="first last simple">
<li>External C dependency</li>
</ul>
</td>
</tr>
<tr class="row-even"><td>lxml’s XML parser</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"lxml-xml")</span></code>
<code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"xml")</span></code></td>
<td><ul class="first last simple">
<li>Very fast</li>
<li>The only currently supported
XML parser</li>
</ul>
</td>
<td><ul class="first last simple">
<li>External C dependency</li>
</ul>
</td>
</tr>
<tr class="row-odd"><td>html5lib</td>
<td><code class="docutils literal"><span class="pre">BeautifulSoup(markup,</span> <span class="pre">"html5lib")</span></code></td>
<td><ul class="first last simple">
<li>Extremely lenient</li>
<li>Parses pages the same way a
web browser does</li>
<li>Creates valid HTML5</li>
</ul>
</td>
<td><ul class="first last simple">
<li>Very slow</li>
<li>External Python
dependency</li>
</ul>
</td>
</tr>
</tbody>
</table>

### Beautiful Soup transforms a complex HTML document into a complex tree of Python objects. But you’ll only ever have to deal with about four kinds of objects: Tag, NavigableString, BeautifulSoup, and Comment.

In [33]:
soup = BeautifulSoup('<b id="boldest">Extremely bold</b>','lxml')
tag = soup.b
type(tag)

bs4.element.Tag

In [30]:
# Every tag has a name, accessible as .name
print(tag.name)

b


In [31]:
# If you change a tag’s name, the change will be reflected in any HTML markup generated by Beautiful Soup:
tag.name = "blockquote"
tag

<blockquote class="boldest">Extremely bold</blockquote>

In [35]:
# A tag may have any number of attributes. The tag <b id="boldest"> has an attribute “id” whose value is “boldest”. 
# You can access a tag’s attributes by treating the tag like a dictionary
print(tag['id'])

#to access the whole dictionary
print(tag.attrs)

boldest
{'id': 'boldest'}


In [36]:
# You can add, remove, and modify a tag’s attributes. Again, this is done by treating the tag as a dictionary:
tag['id'] = 'verybold'
tag['another-attribute'] = 1
tag

<b another-attribute="1" id="verybold">Extremely bold</b>

In [37]:
del tag['id']
del tag['another-attribute']
tag

<b>Extremely bold</b>

In [38]:
tag['id']

KeyError: 'id'

In [39]:
print(tag.get('id'))

None


In [40]:
# HTML 4 defines a few attributes that can have multiple values. HTML 5 removes a couple of them, but defines a few more.
# The most common multi-valued attribute is class (that is, a tag can have more than one CSS class).
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
css_soup.p['class']



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


['body', 'strikeout']

In [41]:
# if an attribute looks like it has more than one value, but it’s not a multi-valued attribute as defined by any version 
# of the HTML standard, Beautiful Soup will leave the attribute alone
id_soup = BeautifulSoup('<p id="my id"></p>')
id_soup.p['id']

'my id'

In [44]:
# When you turn a tag back into a string, multiple attribute values are consolidated:
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
print(rel_soup.a['rel'])
print('\n')
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)

['index']


<p>Back to the <a rel="index contents">homepage</a></p>


In [48]:
# If you parse a document as XML, there are no multi-valued attributes:
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']

'body strikeout'

In [50]:
# A string corresponds to a bit of text within a tag. Beautiful Soup uses the NavigableString class to contain these 
# bits of text:
print(tag.string)
print(type(tag.string))

Extremely bold
<class 'bs4.element.NavigableString'>


In [51]:
# You can’t edit a string in place, but you can replace one string with another, using replace_with():
tag.string.replace_with("No longer bold")
tag

<b>No longer bold</b>

#### A NavigableString is just like a Python Unicode string, except that it also supports some of the features described in Navigating the tree and Searching the tree.
#### The BeautifulSoup object itself represents the document as a whole. For most purposes, you can treat it as a Tag object. This means it supports most of the methods described in Navigating the tree and Searching the tree.
#### Since the BeautifulSoup object doesn’t correspond to an actual HTML or XML tag, it has no name and no attributes. But sometimes it’s useful to look at its .name, so it’s been given the special .name “[document]”:

In [52]:
soup.name

'[document]'

#### Tag, NavigableString, and BeautifulSoup cover almost everything you’ll see in an HTML or XML file, but there are a few leftover bits. The only one you’ll probably ever need to worry about is the comment:

In [53]:
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup)
comment = soup.b.string
type(comment)

bs4.element.Comment

In [54]:
# The Comment object is just a special type of NavigableString:
comment

'Hey, buddy. Want to buy a used parser?'

### Navigating the tree

In [55]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [56]:
# The simplest way to navigate the parse tree is to say the name of the tag you want. If you want the <head> tag, just say soup.head:
print(soup.head)
print(soup.title)

<head><title>The Dormouse's story</title></head>
<title>The Dormouse's story</title>


In [57]:
print(soup.body.b)

<b>The Dormouse's story</b>


In [61]:
soup.find('a')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [62]:
# .contents and .children
# A tag’s children are available in a list called .contents:
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [64]:
head_tag.contents

[<title>The Dormouse's story</title>]

In [65]:
title_tag = head_tag.contents[0]
title_tag

<title>The Dormouse's story</title>

In [66]:
title_tag.contents

["The Dormouse's story"]

In [69]:
len(soup.contents)

2

In [72]:
soup.contents[0].name

In [73]:
# A string does not have .contents, because it can’t contain anything:
text = title_tag.contents[0]
text.contents

AttributeError: 'NavigableString' object has no attribute 'contents'

In [74]:
# .descendants
# The .contents and .children attributes only consider a tag’s direct children. For instance, the <head> tag has a single
# direct child–the <title> tag
head_tag.contents

[<title>The Dormouse's story</title>]

In [75]:
for child in head_tag.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story


In [77]:
print(len(list(soup.children)),len(list(soup.descendants)))

2 27


In [78]:
# string
# If a tag has only one child, and that child is a NavigableString, the child is made available as .string:
title_tag.string

"The Dormouse's story"

In [79]:
# If a tag’s only child is another tag, and that tag has a .string, then the parent tag is considered to have the same .
# string as its child:
head_tag.string

"The Dormouse's story"

In [80]:
# If a tag contains more than one thing, then it’s not clear what .string should refer to, so .string is defined to be None:
print(soup.html.string)

None


In [81]:
# .strings and stripped_strings
# If there’s more than one thing inside a tag, you can still look at just the strings. Use the .strings generator:
for string in soup.strings:
    print(repr(string))

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'


In [82]:
# These strings tend to have a lot of extra whitespace, which you can remove by using the .stripped_strings generator instead:
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


In [83]:
# You can access an element’s parent with the .parent attribute. In the example “three sisters” document, the <head> tag is the parent of the <title> tag:
title_tag = soup.title
title_tag

<title>The Dormouse's story</title>

In [84]:
title_tag.parent

<head><title>The Dormouse's story</title></head>

In [85]:
title_tag.string.parent

<title>The Dormouse's story</title>

In [86]:
html_tag = soup.html
type(html_tag.parent)

bs4.BeautifulSoup

In [87]:
print(soup.parent)

None


In [89]:
# You can iterate over all of an element’s parents with .parents. This example uses .parents to travel from an <a> tag buried deep within the document, to the very top of the document:
link = soup.a
link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [90]:
for parent in link.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)

p
body
html
[document]


In [91]:
# Going sideways
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>")
print(sibling_soup.prettify())

<html>
 <body>
  <a>
   <b>
    text1
   </b>
   <c>
    text2
   </c>
  </a>
 </body>
</html>


In [92]:
# .next_sibling and .previous_sibling
sibling_soup.b.next_sibling

<c>text2</c>

In [93]:
sibling_soup.c.previous_sibling

<b>text1</b>

In [95]:
print(sibling_soup.b.previous_sibling)
print(sibling_soup.c.next_sibling)

None
None


In [96]:
# In real documents, the .next_sibling or .previous_sibling of a tag will usually be a string containing whitespace.
link = soup.a
link.next_sibling

',\n'

In [97]:
link.next_sibling.next_sibling

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [98]:
for sibling in soup.a.next_siblings:
    print(repr(sibling))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'


In [100]:
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

' and\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
',\n'
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
'Once upon a time there were three little sisters; and their names were\n'


In [102]:
# Going back and forth
last_a_tag = soup.find("a", id="link3")
last_a_tag.next_sibling

';\nand they lived at the bottom of a well.'

In [103]:
last_a_tag.next_element

'Tillie'

In [104]:
last_a_tag.previous_element

' and\n'

In [105]:
for element in last_a_tag.next_elements:
    print(repr(element))

'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
<p class="story">...</p>
'...'
'\n'
