In [1]:
# Sample HTML document, held as a string, that the following cells parse and query.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [2]:
import re
from bs4 import BeautifulSoup
# Parse the sample document with the 'html.parser' parser; every later cell queries `soup`.
soup = BeautifulSoup(html_doc, 'html.parser')

## Aller sur une balise suivant son nom

In [3]:
# CSS selector: every <a> tag located anywhere inside <body> (returned as a list).
soup.select("body a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [4]:
# soup.title reaches the <title> tag directly by its name.
print("balise.enfant =         ",soup.title)
print("balise.texte =          ",soup.title.string) # alternative: .get_text() (soup.body.contents, by contrast, lists the children of <body>)
# .parent is the tag that directly encloses <title>, here <head>.
print("parent de la balise =   ",soup.title.parent)

balise.enfant =          <title>The Dormouse's story</title>
balise.texte =           The Dormouse's story
parent de la balise =    <head><title>The Dormouse's story</title></head>


In [5]:
# prettify() renders the tag and its content as an indented, multi-line string.
print("balise et son contenu = \n",soup.head.prettify())

balise et son contenu = 
 <head>
 <title>
  The Dormouse's story
 </title>
</head>



In [6]:
# All text strings of the page that match the regex "Dormouse".
soup.find_all(string=re.compile("Dormouse"))

["The Dormouse's story", "The Dormouse's story"]

## Accéder à des balises lointaines suivant leur nom ou leurs caractéristiques

In [7]:
# find_all('a') collects every <a> tag of the document into a list.
print("descendants nommés ...: ",soup.find_all('a'))

descendants nommés ...:  [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [8]:
# Print the name of every tag whose name matches the regex "t" (here: html, title).
name_pattern = re.compile("t")
for tag in soup.find_all(name_pattern):
    print(tag.name)

html
title


In [9]:
# find(id=...) returns the single matching tag ...
print("descendants avec id:  ",soup.find(id="link3"))
# ... whereas the CSS selector "#link3" returns a list of matches.
print("descendants avec id:  ",soup.select("#link3"))

descendants avec id:   <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
descendants avec id:   [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [10]:
# <p> tags that carry the CSS class "story" (class filter spelled out with class_).
soup.find_all("p", class_="story")

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [11]:
# Tags of any name whose class is "story", with the filter given as an attrs dictionary.
soup.find_all(attrs={"class": "story"})

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [12]:
# Tags selected by one of their attributes: href must match the regex "elsie".
soup.find_all(href=re.compile("elsie"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [13]:
# <a> tags whose text is "Elsie" (explicit find_all instead of calling soup directly).
soup.find_all("a", string="Elsie")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

## Caractéristiques d'une balise

In [14]:
# Show the attribute dictionary (.attrs) of each tag whose id is "link3".
link3_tags = soup.find_all(id="link3")
for i in link3_tags:
    print(i.attrs)

{'href': 'http://example.com/tillie', 'class': ['sister'], 'id': 'link3'}


In [15]:
print("caractéristique d'une balise: ")
# Read one attribute (href) from every <a> tag of the document.
anchor_tags = soup.find_all('a')
for link in anchor_tags:
    print(link.get('href'))

caractéristique d'une balise: 
http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


## Enfants, frères et parents d'une balise

#### Les enfants

In [16]:
# Direct children of <head>, one per line.
head_children = soup.head.children
for child in head_children:
    print(child)

<title>The Dormouse's story</title>


In [17]:
# Second child of <body>: index its direct children through .contents.
# (.next_element follows the document's parse order and may descend into a
# child's own content, so chaining it only reaches the second child when the
# first child is a plain text node, as is the case here.)
soup.body.contents[1]

<p class="title"><b>The Dormouse's story</b></p>

In [18]:
# All descendants of <head> (children, their children, ...), separated by a blank line.
head_descendants = soup.head.descendants
for child in head_descendants:
    print(child, end="\n\n")

<title>The Dormouse's story</title>

The Dormouse's story



In [19]:
# Descendants of <body> that are either <a> or <b> tags (a list of names matches any of them).
soup.body.find_all(["a", "b"])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

#### Les frères

In [20]:
# Reach the next sibling tag of <head>; .previous_sibling walks the other way.
# Two hops are used: the first lands on the newline text between </head> and <body>.
soup.head.next_sibling.next_sibling

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

#### Le parent

In [21]:
# Go back up from <title> to the tag that encloses it (here <head>).
soup.head.title.find_parent()

<head><title>The Dormouse's story</title></head>