# **BS4 101**
Source: https://beautiful-soup-4.readthedocs.io/en/latest/#quick-start

### **Quick Start**

In [2]:
# Sample HTML document that the next cell parses into `soup`.
# Note: the markup contains no closing </body> or </html> tags.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [4]:
from bs4 import BeautifulSoup

# Build the parse tree from the sample document using the "html.parser"
# backend, then print the tree re-indented.
soup = BeautifulSoup(html_doc, features='html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [12]:
# Dotted access (soup.title, soup.p, ...) yields the first matching tag.
# Each value is printed on its own line.
print(
    soup.title,              # the whole <title> element
    soup.title.name,         # its tag name
    soup.title.string,       # the text it contains
    soup.title.parent.name,  # name of the enclosing tag
    soup.p,                  # first <p> element
    soup.p['class'],         # class attribute, returned as a list
    soup.a,                  # first <a> element
    soup.b,                  # first <b> element
    soup.find(id='link3'),   # element whose id is "link3"
    sep='\n',
)

<title>The Dormouse's story</title>
title
The Dormouse's story
head
<p class="title"><b>The Dormouse's story</b></p>
['title']
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<b>The Dormouse's story</b>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


In [13]:
# Visit every <a> tag in the document and print its href attribute.
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [14]:
# Print only the text of the document, without any markup.
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



### **Kinds of objects**
Working with `Tag`, `NavigableString`, `BeautifulSoup`, and `Comment` objects.

In [20]:
# Parse a one-tag snippet and take its <b> element to inspect its type.
# The parser is named explicitly (as in the Quick Start cell) so the result
# does not depend on which parsers happen to be installed and Beautiful Soup
# does not have to guess one.
# Note: this rebinds `soup`, replacing the Quick Start document.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
type(tag)

bs4.element.Tag

Tag

In [24]:
# Renaming a tag changes the markup that is generated for it.
tag.name = "blockquote"
print(tag)

<blockquote class="boldest">Extremely bold</blockquote>


Attributes

In [26]:
# Look up a single attribute by indexing the tag like a dictionary.
tag['class']

['boldest']

In [27]:
# The tag's complete attribute dictionary.
tag.attrs

{'class': ['boldest']}

In [29]:
# Attributes are added by item assignment.
# The integer value 1 appears as "1" in the rendered markup.
tag['id'] = 'verybold'
tag['attribute2'] = 1
tag

<blockquote attribute2="1" class="boldest" id="verybold">Extremely bold</blockquote>

Multi-valued attributes

In [30]:
# `class` is multi-valued: its value comes back as a list with one entry per
# whitespace-separated class name, even when there is only one.
# The parser is named explicitly so the result does not depend on which
# parsers are installed.
css_soup = BeautifulSoup('<p class="body"></p>', 'html.parser')
print(css_soup.p['class'])

css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
print(css_soup.p['class'])

['body']
['body', 'strikeout']


In [31]:
# `id` is not multi-valued: a value containing a space stays one string.
# The parser is named explicitly so the result does not depend on which
# parsers are installed.
id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
id_soup.p['id']

'my id'