In [1]:
import requests 

In [2]:
# Download the sample page. The timeout keeps this cell from hanging forever if
# the server never answers, and raise_for_status() surfaces a 4xx/5xx response
# here instead of letting later cells parse an error page.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/simple.html",
    timeout=10,
)
page.raise_for_status()
page

<Response [200]>

In [3]:
# HTTP status of the response; 200 means the request succeeded.
page.status_code

200

In [10]:
# page.content is the raw response body as bytes (note the b'...' prefix in the output).
print(page.content)

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


In [6]:
from bs4 import BeautifulSoup

In [7]:
# Parse the downloaded HTML into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')

In [9]:
# prettify() gives the parsed document as an indented string; print() is used
# so that its newlines render as real line breaks.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [11]:
# .children is a lazy iterator rather than a list, so wrap it in list() to see every top-level node.
list(soup.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [12]:
# Show the class of each top-level node; .children can be iterated directly.
[type(node) for node in soup.children]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [14]:
# The most important object type, and the one we'll deal with most often, is the Tag object.
# Index 2 is the <html> Tag: the Doctype and a newline string come before it
# (see the types listed by the previous cell).
html = list(soup.children)[2]
html

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [16]:
# A Tag exposes the same .children property as the soup object,
# so we can descend one level further from <html>.
list(html.children)

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

In [19]:
# <body> sits at index 3 of the <html> children: newline, <head>, newline, <body>, newline.
body = list(html.children)[3]
body

<body>
<p>Here is some simple content for this page.</p>
</body>

In [22]:
# The <p> tag is surrounded by newline strings, so it ends up at index 1.
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [25]:
# Pick the <p> Tag out from between the two newline strings.
p = list(body.children)[1]

In [26]:
# get_text() returns the text inside the tag, without the surrounding markup.
p.get_text()

'Here is some simple content for this page.'

# Finding all instances of a tag at once

What we did above was useful for figuring out how to navigate a page, but it took a lot of commands to do something fairly simple. If we want to extract every instance of a given tag, we can instead use the `find_all` method, which returns a list of all the instances of that tag on the page.

In [27]:
# Parse the same response again and collect every <p> tag in a single call.
soup1 = BeautifulSoup(page.content, 'html.parser')
soup1.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [29]:
# find_all returns a list, so index into it before calling get_text().
soup1.find_all('p')[0].get_text()

'Here is some simple content for this page.'

If you instead only want the first instance of a tag, you can use the `find` method, which returns a single `Tag` object (or `None` if nothing matches) rather than a list:

In [31]:
# find returns a single element rather than a list, so get_text() can be called on it directly.
soup1.find('p').get_text()

'Here is some simple content for this page.'

# Searching for tags by class and id

In [44]:
# Download the second sample page, whose tags carry class and id attributes.
# NOTE: this rebinds `page`, so earlier cells that read `page` will see this
# response if they are re-run afterwards.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html",
    timeout=10,  # do not hang indefinitely on an unresponsive server
)
page.raise_for_status()  # fail here on a 4xx/5xx instead of parsing an error page
soup2 = BeautifulSoup(page.content, 'html.parser')
print(soup2.prettify())

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>


In [35]:
# class_ (trailing underscore, because `class` is a Python keyword) matches any
# tag whose class attribute includes 'outer-text', even alongside other classes.
soup2.find_all(class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [36]:
# Same class search, restricted to <p> tags.
soup2.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [37]:
# find returns only the first tag carrying the class, not a list.
soup2.find(class_='outer-text')

<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>

In [38]:
# Search by the id attribute instead of by class.
soup2.find_all(id='second')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>]

# Using CSS Selectors

In [41]:
# CSS selector 'div p': every <p> nested inside a <div>.
soup2.select('div p')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>]

In [43]:
# Print the text of each <p> found inside a <div>; the surrounding whitespace
# from the source HTML is kept as-is.
for paragraph in soup2.select('div p'):
    print(paragraph.get_text())


                First paragraph.
            

                Second paragraph.
            
