In [1]:
from bs4 import BeautifulSoup

# Quick Start

In [2]:
# Sample document reused by the navigation examples that follow.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse with Python's built-in parser so no extra dependency is needed here.
soup = BeautifulSoup(html_doc, 'html.parser')

# prettify() returns an indented string; print() renders its newlines.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [3]:
# Display the parsed document without the extra indentation added by prettify()
soup


<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

### Simple ways to navigate

In [4]:
# The whole <title> element, markup included
soup.title

<title>The Dormouse's story</title>

In [5]:
# Name of the tag, as a string
soup.title.name

'title'

In [6]:
# Text contained inside the tag
soup.title.string

"The Dormouse's story"

In [7]:
# Name of the tag that encloses <title>
soup.title.parent.name

'head'

In [8]:
# Attribute access returns only the first matching <p> tag
soup.p

<p class="title"><b>The Dormouse's story</b></p>

**What about returning all of the paragraphs?**

In [9]:
# find_all() returns every matching tag, in document order
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [10]:
# CSS class(es) of the first <p>; `class` is returned as a list
soup.p['class']

['title']

In [11]:
# First <a> tag in the document
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [12]:
# Every <a> tag in the document
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [13]:
# Look a tag up by the value of its id attribute
soup.find(id='link3')

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [14]:
# Collect the href of every <a> tag.
# Tag.get() yields None instead of raising when an anchor has no href.
links = [anchor.get('href') for anchor in soup.find_all('a')]
links

['http://example.com/elsie',
 'http://example.com/lacie',
 'http://example.com/tillie']

In [15]:
# All human-readable text in the document, with the markup stripped
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



# Making the soup

## Approach 1
By file handle

In [16]:
# Build the soup from an open file handle.
# The encoding is given explicitly so the read does not depend on the platform
# default (the page declares utf-8 in its <meta charset>), and the parser is
# named so the result does not depend on which parsers happen to be installed.
with open('index.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Test Document
  </title>
 </head>
 <body>
  <h1>
   Test heading
  </h1>
  <p>
   Test paragraph
  </p>
 </body>
</html>


## Approach 2
As a string

In [17]:
# Build the soup straight from a markup string.
# The parser is named explicitly: parsers repair invalid markup differently,
# and the <body><p> wrapper shown in the output is how lxml handles bare text.
soup = BeautifulSoup("<html>Data</html>", "lxml")
print(soup.prettify())

<html>
 <body>
  <p>
   Data
  </p>
 </body>
</html>


# Kinds of objects

## Tag

In [18]:
# A Tag object corresponds to an HTML/XML tag in the original document.
# Naming the parser avoids bs4 guessing one (and warning about it).
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'lxml')
tag = soup.b
type(tag)

bs4.element.Tag

## Name

In [19]:
# Every tag exposes its name through `.name`
tag.name

'b'

In [20]:
# Renaming a tag changes the markup that is generated for it
tag.name = "p"
tag

<p class="boldest">Extremely bold</p>

## Attributes

In [21]:
# Attributes are read with dictionary-style indexing
tag['class']

['boldest']

In [22]:
# `.attrs` exposes all of the tag's attributes as a dictionary
tag.attrs

{'class': ['boldest']}

In [23]:
# Attributes can be modified or added with dictionary-style assignment
tag['class'] = 'verybold'
tag['id'] = 'boldId'
tag

<p class="verybold" id="boldId">Extremely bold</p>

In [24]:
# Attributes are removed with `del`, as with dictionary keys
del tag['id']
del tag['class']
tag

<p>Extremely bold</p>

## Multi-valued attributes

In [25]:
# A single class value still comes back as a one-element list.
# The parser is named explicitly so bs4 does not have to guess one.
css_soup = BeautifulSoup('<p class="body"></p>', 'lxml')
css_soup.p['class']

['body']

In [26]:
# Space-separated class values are split into a list.
# The parser is named explicitly so bs4 does not have to guess one.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'lxml')
css_soup.p['class']

['body', 'strikeout']

In [41]:
# `p['class']` is a plain Python list, so pop() removes the last class in place
css_soup.p['class'].pop()
css_soup

<html><body><p class="body"></p></body></html>

**Attributes that the HTML standard does not define as multi-valued (such as `id`) are kept as a single string**

In [27]:
# `id` is not a multi-valued attribute, so its value is not split on spaces.
# The parser is named explicitly so bs4 does not have to guess one.
id_soup = BeautifulSoup('<p id="my id"></p>', 'lxml')
id_soup.p['id']

'my id'

In [28]:
# Assign a list to the single-valued `id` attribute and inspect the resulting markup
id_soup.p['id'] = ['my', 'second', 'id']
id_soup

<html><body><p id="my second id"></p></body></html>

Assigning a list to a single-valued attribute such as `id` is allowed: the values are joined with spaces when the document is serialized.

In [29]:
# multi_valued_attributes=None turns off list splitting, so `class` comes back as one string
no_list_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html', multi_valued_attributes=None)
no_list_soup.p['class']

'body strikeout'

In [30]:
# get_attribute_list() always returns a list, even when the attribute holds a single string
no_list_soup.p.get_attribute_list('class')

['body strikeout']

**When a document is parsed as XML, no attribute is treated as multi-valued by default**

In [31]:
# With the 'xml' parser the class attribute is returned as one unsplit string
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']

'body strikeout'

In [32]:
# Opt in to multi-valued handling under XML: '*' applies the rule to every tag
class_is_multi = {'*': 'class'}
xml_soup = BeautifulSoup(
    '<p class="body strikeout"></p>',
    'xml',
    multi_valued_attributes=class_is_multi,
)
xml_soup.p['class']

['body', 'strikeout']

## NavigableString


In [33]:
# The tag, the text inside it, and the type bs4 uses to hold that text
tag, tag.string, type(tag.string)

(<p>Extremely bold</p>, 'Extremely bold', bs4.element.NavigableString)

In [34]:
# Swap the tag's text for a new string with replace_with()
tag.string.replace_with("No longer bold")
tag

<p>No longer bold</p>

## BeautifulSoup
The `BeautifulSoup` object represents the parsed document as a whole. For most purposes it can be treated as a **Tag** object.

In [35]:
# Two separately parsed documents: the main one holds a text placeholder,
# and the second one is spliced in where that placeholder sits.
doc = BeautifulSoup("<document><content/>INSERT FOOTER HERE</document>", "xml")
footer = BeautifulSoup("<footer>Here's the footer</footer>", "xml")
# `string=` is the current name of the text filter (older releases called it `text=`).
doc.find(string="INSERT FOOTER HERE").replace_with(footer)

print(doc.prettify())

<?xml version="1.0" encoding="utf-8"?>
<document>
 <content/>
 <footer>
  Here's the footer
 </footer>
</document>


In [36]:
# A BeautifulSoup object has no real tag name; `.name` holds the placeholder '[document]'
soup.name

'[document]'

## Comments and other special strings

In [37]:
# An HTML comment is parsed into a Comment object, a special kind of string.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
# The parser is named explicitly so bs4 does not have to guess one.
soup = BeautifulSoup(markup, "lxml")
comment = soup.b.string
type(comment), comment

(bs4.element.Comment, 'Hey, buddy. Want to buy a used parser?')

In [38]:
# When the document is pretty-printed the comment is shown with its <!-- --> delimiters
print(soup.prettify())

<html>
 <body>
  <b>
   <!--Hey, buddy. Want to buy a used parser?-->
  </b>
 </body>
</html>
