Beautiful Soup is a Python library that is commonly used for web scraping purposes. It allows you to extract data from HTML and XML documents by providing convenient methods and tools for parsing and navigating through the document's structure.

In [1]:
!pip install beautifulsoup4



In [5]:
from bs4 import BeautifulSoup

# Assuming you have an HTML document stored in the 'html_doc' variable
html_doc = """
<html>
<head>
    <title>osAndroid</title>
</head>
<body>
    <div id="content">
        <h1>Heading</h1>
        <p>Hello I am Osanda.</p>
        <ul>
            <li>Item 1</li>
            <li>Item 2</li>
            <li>Item 3</li>
        </ul>

        <ol>
            <li>Item 1</li>
            <li>Item 2</li>
            <li>Item 3</li>

        </ol>
    </div>
</body>
</html>
"""

# Create a BeautifulSoup object
soup = BeautifulSoup(html_doc, 'html.parser')

# Extract the title of the document
title = soup.title.string
print("Title:", title)

# Extract the text of the first paragraph
paragraph = soup.p.string
print("Paragraph:", paragraph)

# Extract the text of each list item
list_items = soup.find_all('li')
print("List Items:")
for item in list_items:
    print(item.string)


Title: osAndroid
Paragraph: Hello I am Osanda.
List Items:
Item 1
Item 2
Item 3
Item 1
Item 2
Item 3


In [10]:
html_doc_001 = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</body>
</html>
"""
# Create a BeautifulSoup object
soup = BeautifulSoup(html_doc_001, 'html.parser')

# Extract the title of the document
title = soup.title.string
print("Title:", title)

# Extract the text of the first paragraph
paragraph = soup.p.string
print("Paragraph:", paragraph)

# Extract the text of each list item
list_items = soup.find_all('li')
print("List Items:")
for item in list_items:
    print(item.string)

links = soup.find_all('a href')
print("Links:")
for item in links:
    print(item.string)


Title: The Dormouse's story
Paragraph: The Dormouse's story
List Items:
Links:


In [11]:

print(soup.prettify())


<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>

