In [1]:
from bs4 import BeautifulSoup

In [2]:
# Sample document used throughout this notebook
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse the document, naming the parser ('html.parser') explicitly
soup = BeautifulSoup(html_doc, 'html.parser')

In [3]:
# prettify() renders the parsed tree as an indented string that mirrors the nesting
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


# Going down

## navigating using tag names
Accessing a tag name as an attribute returns only the first matching element.

In [4]:
# Tag names chain like attributes: soup.head is the <head> tag, soup.head.title the <title> inside it
soup.head, soup.head.title

(<head><title>The Dormouse's story</title></head>,
 <title>The Dormouse's story</title>)

In [5]:
# First <b> tag found beneath <body>
soup.body.b

<b>The Dormouse's story</b>

In [6]:
# The document has three <a> tags, but attribute access returns only the first one
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [7]:
# find_all() collects every matching tag, not just the first
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

### `.contents` and `.children`

A tag's children are available in a list called `.contents`.

In [8]:
# Keep a handle on <head> for the examples that follow
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [9]:
# Direct children of <head>: a single <title> tag
head_tag.contents

[<title>The Dormouse's story</title>]

In [10]:
# .contents can be indexed like a list; element 0 is the <title> tag
title_tag = head_tag.contents[0]
title_tag

<title>The Dormouse's story</title>

In [11]:
# The only child of <title> is its text, which is itself a node in the tree
title_tag.contents

["The Dormouse's story"]

In [12]:
# The BeautifulSoup object has children too; the first one is the <html> tag
soup.contents[0].name

'html'

In [13]:
# A string node is a leaf of the tree, so it has no `.contents`.
# The error is the point of this cell: catch it and print it, so the
# demonstration stays visible without aborting a "Restart & Run All".
text = title_tag.contents[0]
try:
    text.contents
except AttributeError as exc:
    print(f"{type(exc).__name__}: {exc}")

AttributeError: 'NavigableString' object has no attribute 'contents'

In [14]:
# `.children` iterates over direct children only; keep just the <p> tags
paragraphs = (child for child in soup.body.children if child.name == 'p')
for paragraph in paragraphs:
    print(paragraph, '---', sep='\n')

<p class="title"><b>The Dormouse's story</b></p>
---
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
---
<p class="story">...</p>
---


### `.descendants`
A **generator** that has all the **descendants**

In [15]:
# Unlike .children, .descendants also reaches nested nodes:
# first the <title> tag, then the string inside it
for node in head_tag.descendants:
    print(node)

<title>The Dormouse's story</title>
The Dormouse's story


In [16]:
# First <p> whose CSS class is "story" (`class_` because `class` is a Python keyword)
story_paragraph = soup.find('p', class_="story")
print(story_paragraph.prettify())

<p class="story">
 Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">
  Elsie
 </a>
 ,
 <a class="sister" href="http://example.com/lacie" id="link2">
  Lacie
 </a>
 and
 <a class="sister" href="http://example.com/tillie" id="link3">
  Tillie
 </a>
 ;
and they lived at the bottom of a well.
</p>



In [17]:
# Traversal order: each tag is followed by its own contents before the next sibling
for position, node in enumerate(story_paragraph.descendants, start=1):
    print(f"{position}: {node}")

1: Once upon a time there were three little sisters; and their names were

2: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
3: Elsie
4: ,

5: <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
6: Lacie
7:  and

8: <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
9: Tillie
10: ;
and they lived at the bottom of a well.


### `.string`

In [18]:
# A tag whose only child is a string: .string returns that string
title_tag, title_tag.string

(<title>The Dormouse's story</title>, "The Dormouse's story")

In [19]:
# <head> has a single child tag (<title>) which in turn has a single string child:
# .string drills down and returns that string
head_tag, head_tag.string

(<head><title>The Dormouse's story</title></head>, "The Dormouse's story")

In [20]:
# <html> has more than one child (<head> and <body>), so .string is None
print(soup.html.string)

None


### `.strings`
A generator that yields every string inside a tag. Useful when the tag has more than one child, where `.string` would return `None`.

In [21]:
# Walk every text node in the document; the repr makes the newlines visible
for text_node in soup.strings:
    print(f"{text_node!r}")

"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'


The output contains a lot of extra whitespace. **Solution?**
### `.stripped_strings`

In [22]:
# Same walk, but whitespace-only strings are dropped and the rest are stripped at both ends
for text_node in soup.stripped_strings:
    print(f"{text_node!r}")

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


# Going up

## `.parent`

In [23]:
# .parent is the tag that directly contains this one: <title> sits inside <head>
title_tag = soup.title
title_tag, title_tag.parent

(<title>The Dormouse's story</title>,
 <head><title>The Dormouse's story</title></head>)

In [24]:
# Strings have a parent as well: the tag that contains them
title_tag.string.parent

<title>The Dormouse's story</title>

In [25]:
# The parent of the top-level <html> tag is the BeautifulSoup object itself
type(soup.html.parent)

bs4.BeautifulSoup

In [26]:
# The BeautifulSoup object is the root of the tree: its .parent is None
print(soup.parent)

None


## `.parents`
It's a generator, as expected.

In [27]:
# Start from the first <a> tag in the document
link = soup.a
link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [28]:
# Climb from the link up to the root, printing the name of every ancestor on the way
for ancestor in link.parents:
    print(ancestor.name)

p
body
html
[document]


# Going sideways

In [29]:
# Name the parser explicitly: without it Beautiful Soup guesses among the installed
# parsers (emitting GuessedAtParserWarning), so the tree can differ between machines.
# "lxml" is the parser that adds the <html><body> wrapper seen in this cell's output.
# The markup no longer carries the stray </b> that followed </c>; it was a typo
# and did not affect the parsed tree.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", "lxml")
print(sibling_soup.prettify())

<html>
 <body>
  <a>
   <b>
    text1
   </b>
   <c>
    text2
   </c>
  </a>
 </body>
</html>


### `.next_sibling` and `.previous_sibling`

In [30]:
# <b> and <c> are both direct children of <a>, i.e. siblings; <c> comes right after <b>
sibling_soup.b.next_sibling

<c>text2</c>

In [31]:
# ...and <b> comes right before <c>
sibling_soup.c.previous_sibling

<b>text1</b>

#### When there is text or whitespace between tags

In [32]:
# Back to the main document: the first <a> tag
link = soup.a
link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [33]:
# The next sibling is not the second <a> tag but the text (comma + newline) separating the two
link.next_sibling

',\n'

In [34]:
# Stepping over that text node reaches the second <a> tag
link.next_sibling.next_sibling

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

### `.next_siblings` and `.previous_siblings`

In [35]:
# Everything that follows the first link inside its parent, in document order
first_link = soup.a
for node in first_link.next_siblings:
    print(repr(node))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'


In [36]:
# Everything that precedes the third link inside its parent, nearest node first
third_link = soup.find(id="link3")
for node in third_link.previous_siblings:
    print(repr(node))

' and\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
',\n'
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
'Once upon a time there were three little sisters; and their names were\n'


# Going back and forth

### `.next_element` and `.previous_element`

In [37]:
# Look up the third link by tag name and id
last_a_tag = soup.find("a", id="link3")
last_a_tag

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [38]:
# next_sibling stays on the same level (the text after </a>), while
# next_element is the node that comes next in the document: the string inside the tag
last_a_tag.next_sibling, last_a_tag.next_element

(';\nand they lived at the bottom of a well.', 'Tillie')

In [39]:
# Going backwards, both give the same node here: the text just before the tag
last_a_tag.previous_sibling, last_a_tag.previous_element


(' and\n', ' and\n')

### `.next_elements` and `.previous_elements`

In [40]:
# Follow the document from the third link to its end, one parsed node at a time
for node in last_a_tag.next_elements:
    print(f"{node!r}")

'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
<p class="story">...</p>
'...'
'\n'
