# Web scraping using BeautifulSoup

In [22]:
# Sample HTML document used by every cell below. Adjacent string literals
# inside the parentheses are joined into one string, so the value contains
# no newline characters between the fragments.
html = (
    '<!DOCTYPE html>'
    '<html>'
    '<head>'
    '<title>Testing Web Page </title>'
    '</head>'
    '<body>'
    '<h1> Web Scraping </h1>'
    '<p id = "first_para">'
    "Let's start learning "
    '<b>'
    'Web Scraping'
    '</b>'
    '</p>'
    '<p class = "abc" id = "second_para">'
    'You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>'
    '</p>'
    '<p class = "abc">'
    '<a href = "https://codingninjas.in/"> Coding Ninjas </a>'
    '</p>'
    '</body>'
    '</html>'
)

## The BeautifulSoup library is provided by the bs4 package

In [23]:
from bs4 import BeautifulSoup as bs

In [24]:
# Parse the HTML string using the 'html.parser' parser. The bare `data` on
# the last line makes the notebook display the parsed document.
data = bs(html,'html.parser')
data

<!DOCTYPE html>
<html><head><title>Testing Web Page </title></head><body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>

In [25]:
# The parsed document is a bs4.BeautifulSoup object.
type(data)

bs4.BeautifulSoup

In [26]:
# prettify() returns the document as an indented string with each tag and
# text node on its own line; print() is needed so the newlines are rendered.
print(data.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Testing Web Page
  </title>
 </head>
 <body>
  <h1>
   Web Scraping
  </h1>
  <p id="first_para">
   Let's start learning
   <b>
    Web Scraping
   </b>
  </p>
  <p class="abc" id="second_para">
   You can read more about BeautifulSoup from
   <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/">
    here
   </a>
  </p>
  <p class="abc">
   <a href="https://codingninjas.in/">
    Coding Ninjas
   </a>
  </p>
 </body>
</html>


In [27]:
# Accessing a tag name as an attribute returns that tag: here the <title> element.
data.title

<title>Testing Web Page </title>

In [28]:
# .name is the tag's name; .string is the text inside the tag. The trailing
# space in the source HTML is preserved in the string.
print(data.title.name)
print(data.title.string)

title
Testing Web Page 


In [35]:
# Inspect the first <p> tag in the document: its name, its attributes as a
# dict, and two ways of reading the 'id' attribute (.get('id') and ['id']),
# which both return 'first_para' here.
print(data.p.name)
print(data.p.attrs)
print(data.p.get('id'))
print(data.p['id'])

p
{'id': 'first_para'}
first_para
first_para


In [39]:
# get_text() joins all text in the document into a single string. Whitespace
# is taken as-is from the source, so adjacent elements with no space between
# them run together (e.g. 'Web ScrapingYou can read ...').
data.get_text()

"Testing Web Page  Web Scraping Let's start learning Web ScrapingYou can read more about BeautifulSoup from  here  Coding Ninjas "

In [41]:
# find() returns only the first tag that matches: the first <p> here.
data.find('p')

<p id="first_para">Let's start learning <b>Web Scraping</b></p>

In [42]:
# find_all() returns every matching tag, in document order: all three <p> tags.
data.find_all('p')

[<p id="first_para">Let's start learning <b>Web Scraping</b></p>,
 <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>,
 <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>]

### Searching the parse tree

In [45]:
# Find using a list: each entry is treated as a tag name, and a tag is
# returned if its name equals any entry. This document has no <id> tag, so
# the 'id' entry matches nothing and only the <p> tags come back. To filter
# on the id attribute, pass the id= keyword argument instead.
data.find_all(['p','id'])

[<p id="first_para">Let's start learning <b>Web Scraping</b></p>,
 <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>,
 <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>]

In [47]:
# Find using True: matches every tag in the document, so the result lists
# each tag (<html>, <head>, <title>, ...) in document order.
data.find_all(True)

[<html><head><title>Testing Web Page </title></head><body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>,
 <head><title>Testing Web Page </title></head>,
 <title>Testing Web Page </title>,
 <body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>,
 <h1> Web Scraping </h1>,
 <p id="first_para">Let's start learning <b>Web Scraping</b></p>,
 <b>Web Scraping</b>,
 <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="ht

In [50]:
# Find by id attribute. Several ids are given here; a tag is returned if its
# id equals any one of them, so both paragraphs that have an id match.
data.find_all(id = ('first_para','second_para'))

[<p id="first_para">Let's start learning <b>Web Scraping</b></p>,
 <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>]

In [49]:
# Find by CSS class. The keyword is spelled class_ (with a trailing
# underscore) because `class` is a reserved word in Python.
data.find_all(class_ = 'abc')

[<p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>,
 <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>]

### Going down

In [51]:
# Collect every <p> tag into `li` and print each one on its own line.
# `li` is reused by the cells that follow.
li = data.find_all('p')
for i in li:
    print(i)

<p id="first_para">Let's start learning <b>Web Scraping</b></p>
<p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>
<p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>


In [53]:
# .string gives a tag's text only when the tag has a single child; otherwise
# it is None. The first two <p> tags each have two children (text plus a
# nested tag), so they print None. The third <p> has one child, an <a> tag
# whose only child is text, so that text is returned.
for i in li:
    print(i.string)

None
None
 Coding Ninjas 


In [56]:
# .strings iterates over every piece of text nested inside a tag, including
# text inside child tags. list() collects the pieces so they can be printed.
for i in li:
    print(list(i.strings))

["Let's start learning ", 'Web Scraping']
['You can read more about BeautifulSoup from ', ' here ']
[' Coding Ninjas ']


In [59]:
# .stripped_strings yields the same pieces of text as .strings, with the
# leading and trailing whitespace removed from each one.
for i in li:
    print(list(i.stripped_strings))

["Let's start learning", 'Web Scraping']
['You can read more about BeautifulSoup from', 'here']
['Coding Ninjas']


In [60]:
# The <html> element itself. Unlike displaying `data`, this does not include
# the <!DOCTYPE html> line, which sits outside the <html> tag.
data.html

<html><head><title>Testing Web Page </title></head><body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>

In [66]:
# .contents returns a tag's direct children as a list. <html> has exactly
# two direct children here: <head> and <body>.
a =data.html.contents
print(len(a))
print(a)

2
[<head><title>Testing Web Page </title></head>, <body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>]


In [71]:
# .children exposes the same direct children as .contents, but as an
# iterator to loop over rather than a list.
a =data.html.children
for i in a :
    print(i)

<head><title>Testing Web Page </title></head>
<body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>


In [73]:
# .descendants walks everything nested inside the tag at any depth, both
# tags and text strings, in document order: a tag is followed by its own
# contents before the next sibling. There are 17 such nodes under <html>.
b = list(data.html.descendants)
print(len(b))
print(b)

17
[<head><title>Testing Web Page </title></head>, <title>Testing Web Page </title>, 'Testing Web Page ', <body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>, <h1> Web Scraping </h1>, ' Web Scraping ', <p id="first_para">Let's start learning <b>Web Scraping</b></p>, "Let's start learning ", <b>Web Scraping</b>, 'Web Scraping', <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>, 'You can read more about BeautifulSoup from ', <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>, ' here ', <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>, <a href="https://codingninjas.i

### Going up

In [84]:
# .parent is a single object: the element that directly contains this one.
# The parent of <html> is the whole parsed document. Wrapping it in list()
# iterates over that document's children, which is why the result is the
# doctype (shown as 'html') followed by the <html> tag, not a list of parents.
a=list(data.html.parent)
a

['html',
 <html><head><title>Testing Web Page </title></head><body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>]

In [85]:
# .parents iterates over all ancestors of the element. <html> has only one
# ancestor, the parsed document itself, so the list has a single entry.
a=list(data.html.parents)
a

[<!DOCTYPE html>
 <html><head><title>Testing Web Page </title></head><body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body></html>]

### Going sideways

In [94]:
# Siblings are nodes that share the same parent. <head> and <body> are both
# children of <html>, so <head>.next_sibling is <body> and
# <body>.previous_sibling is <head>. Only `a` is printed here; `b` is
# printed in the next cell.
a = data.head.next_sibling
b = data.body.previous_sibling
print(a)

<body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>


In [95]:
# `b` was set in the previous cell to data.body.previous_sibling (the <head> tag).
print(b)

<head><title>Testing Web Page </title></head>


In [98]:
# The plural forms iterate over all siblings after / before the first <p>.
# NOTE: bs4 documents these as generators, so list() uses them up. Re-run
# these assignments before printing `a` or `b` a second time, otherwise the
# list will be empty.
a = data.p.next_siblings
b = data.p.previous_siblings
print(list(a))

[<p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>, <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>]


In [99]:
# `b` was set in the previous cell to data.p.previous_siblings: the siblings
# that come before the first <p>, which is only the <h1> tag here.
print(list(b))

[<h1> Web Scraping </h1>]


### Going back and forth

In [103]:
# next_element / previous_element follow the order in which the document was
# parsed, not the sibling structure. Right after the first <p> tag comes its
# own first piece of text; right before it comes the text inside the
# preceding <h1>.
a = data.p.next_element
b = data.p.previous_element
print(a)
print(b)

Let's start learning 
 Web Scraping 


In [104]:
# Plural forms: every node parsed after the first <p> tag (`a`) and every
# node parsed before it (`b`). `b` runs backwards, starting with the node
# closest to the <p> and moving toward the start of the document.
a = data.p.next_elements
b = data.p.previous_elements
print(list(a))
print(list(b))

["Let's start learning ", <b>Web Scraping</b>, 'Web Scraping', <p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p>, 'You can read more about BeautifulSoup from ', <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a>, ' here ', <p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p>, <a href="https://codingninjas.in/"> Coding Ninjas </a>, ' Coding Ninjas ']
[' Web Scraping ', <h1> Web Scraping </h1>, <body><h1> Web Scraping </h1><p id="first_para">Let's start learning <b>Web Scraping</b></p><p class="abc" id="second_para">You can read more about BeautifulSoup from <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"> here </a></p><p class="abc"><a href="https://codingninjas.in/"> Coding Ninjas </a></p></body>, 'Testing Web Page ', <title>Testing Web Page </title>, <head><title>Testing Web Page </title></head>, <html><head><title>Test