In [1]:
# Sample HTML page used throughout the notebook. Adjacent string literals are
# joined at compile time, so the markup is one continuous line with no newline
# characters between the tags.
html = (
    '<!DOCTYPE html>'
    '<html>'
    '<head>'
    '<title> Testing Web Page </title>'
    '</head>'
    '<body>'
    '<h1> Web Scraping </h1>'
    '<p class = "abc" id = "first_para">'
    "Let 's start learning"
    '<b>'
    'Web Scraping'
    '</b>'
    '</p>'
    '<p id = "def">'
    'You can read more about BeautifulSoup from <a href = "https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>'
    '</p>'
    '<p class = "abc">'
    '<a href = "https://codingninjas.in/"> Coding Ninjas </a>'
    '</p>'
    '</body>'
    '</html>'
)

In [2]:
from bs4 import BeautifulSoup
# Parse the sample markup with Python's built-in HTML parser.
data = BeautifulSoup(markup=html, features='html.parser')

# A list filter matches a tag whose name equals any entry in the list,
# so this returns every <p> and every <a> tag in document order.
data.find_all(name=['p', 'a'])

[<p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p>,
 <p id="def">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>,
 <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>,
 <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>,
 <a href="https://codingninjas.in/"> Coding Ninjas </a>]

In [3]:
# Passing True as the filter matches every tag in the document
# (only tags are returned, not the text strings inside them).
data.find_all(True)

[<html><head><title> Testing Web Page </title></head><body><h1> Web Scraping </h1><p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p><p id="def">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>,
 <head><title> Testing Web Page </title></head>,
 <title> Testing Web Page </title>,
 <body><h1> Web Scraping </h1><p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p><p id="def">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>,
 <h1> Web Scraping </h1>,
 <p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p>,
 <b>Web Scraping</b>,
 <p id="def">You can read more about BeautifulSoup from <a href="https://www.crummy.com/

In [4]:
# Keyword arguments filter on tag attributes: return every tag whose id is "first_para".
data.find_all(id = 'first_para')

[<p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p>]

In [5]:
# "class" is a reserved word in Python, so the CSS-class filter is spelled class_.
data.find_all(class_ = 'abc')

[<p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p>,
 <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>]

In [6]:
# CSS selectors are also supported via data.select('selector_value'), e.g. data.select('p.abc')

In [7]:
# prettify() returns the parse tree as an indented string with a separate line
# for each tag and each text string; print() renders its line breaks.
print(data.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Testing Web Page
  </title>
 </head>
 <body>
  <h1>
   Web Scraping
  </h1>
  <p class="abc" id="first_para">
   Let 's start learning
   <b>
    Web Scraping
   </b>
  </p>
  <p id="def">
   You can read more about BeautifulSoup from
   <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/">
    here
   </a>
  </p>
  <p class="abc">
   <a href="https://codingninjas.in/">
    Coding Ninjas
   </a>
  </p>
 </body>
</html>


In [8]:
# Attribute-style navigation: data.head is the first <head> tag in the document.
data.head

<head><title> Testing Web Page </title></head>

In [9]:
# Tag-name attributes can be chained to reach nested tags: the <title> inside <head>.
data.head.title

<title> Testing Web Page </title>

In [10]:
# data.title returns the first <title> tag found anywhere in the document,
# without spelling out its parent tags.
data.title

<title> Testing Web Page </title>

In [11]:
# .string gives the tag's single text child, with surrounding whitespace preserved.
data.title.string

' Testing Web Page '

In [12]:
# Collect every <p> tag once; the following cells iterate over this result.
li = data.find_all('p')

# Show the full markup of each paragraph, one per line.
for paragraph in li:
    print(paragraph)

<p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p>
<p id="def">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>
<p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>


In [13]:
# .strings is a generator, so materialise it to see every text fragment
# (whitespace included) nested inside each paragraph.
for paragraph in li:
    texts = list(paragraph.strings)
    print(texts)

["Let 's start learning", 'Web Scraping']
['You can read more about BeautifulSoup from ', ' here ']
[' Coding Ninjas ']


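The `.strings` generator yields every text string nested inside a tag, so the loop above prints the list of strings contained in each paragraph tag, with their original whitespace.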

In [14]:
# .stripped_strings yields the same text fragments as .strings, but with
# leading and trailing whitespace removed from each one.
for paragraph in li:
    clean_texts = list(paragraph.stripped_strings)
    print(clean_texts)

["Let 's start learning", 'Web Scraping']
['You can read more about BeautifulSoup from', 'here']
['Coding Ninjas']


In [15]:
# .contents exposes a tag's direct children as a plain list,
# so it can be printed and measured directly.
arr = data.html.contents
print(arr, len(arr), sep='\n')

[<head><title> Testing Web Page </title></head>, <body><h1> Web Scraping </h1><p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p><p id="def">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>]
2


In [16]:
# .children gives an iterator over the tag's direct children rather than a list,
# so walk it with a loop.
arr_1 = data.html.children
for child in arr_1:
    print(child)

<head><title> Testing Web Page </title></head>
<body><h1> Web Scraping </h1><p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p><p id="def">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>


In [18]:
# .descendants recurses through the whole subtree (tags and text strings);
# materialise it into a list so it can be printed and counted.
arr_2 = [*data.html.descendants]
print(arr_2, len(arr_2), sep='\n')

[<head><title> Testing Web Page </title></head>, <title> Testing Web Page </title>, ' Testing Web Page ', <body><h1> Web Scraping </h1><p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p><p id="def">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>, <h1> Web Scraping </h1>, ' Web Scraping ', <p class="abc" id="first_para">Let 's start learning<b>Web Scraping</b></p>, "Let 's start learning", <b>Web Scraping</b>, 'Web Scraping', <p id="def">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>, 'You can read more about BeautifulSoup from ', <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>, ' here ', <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>, <a href="https://codingninjas.in/"> Coding Ninj

The `<html>` tag has two children and seventeen descendants. `.children` yields only the direct children of a tag, whereas `.descendants` recursively yields every nested node (the children, their children, and so on), including text strings.