In [1]:
import requests

In [6]:
from bs4 import BeautifulSoup

requests.get is used to download a page from the internet

In [2]:
# Download the example page. The timeout (in seconds) keeps this cell from
# hanging indefinitely if the server never responds.
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html", timeout=10)

In [3]:
# Echo the Response object; its repr includes the HTTP status code.
page

<Response [200]>

In [4]:
# The status code tells us whether the page downloaded properly (200 means OK).
page.status_code

200

In [5]:
# The raw response body, returned as bytes (note the b'...' prefix in the output).
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [7]:
# Parse the downloaded bytes into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(page.content, features='html.parser')

In [8]:
# Echo the parsed document; its repr is the HTML markup.
soup

<!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [9]:
# prettify() returns the markup as a string with one tag per line, indented by nesting depth.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


soup.children returns an iterator over the items at the first (top) level of the document; wrapping it in list() lets us see them all at once

In [12]:
# Materialize the top-level nodes of the document so they can be inspected.
list(soup.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

There are 3 items in the first level: 'html' (the <!DOCTYPE html> declaration, which tells us what type of document the page is), '\n' (a newline character between the doctype and the markup), and <html> (the tag that contains everything else in the document)

Creating a list that shows the type of each item in the first level

In [13]:
# Map each top-level node to its class to see what kind of object it is.
list(map(type, soup.children))

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

Tags are the main element type.  I will be grabbing the html which is the third element in soup.children

In [15]:
# Index 2 is the <html> Tag; indexes 0 and 1 are the Doctype and a newline string.
html = list(soup.children)[2]
html

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [17]:
# List the direct children of the <html> tag.
list(html.children)

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

There are 5 children, 3 of which are NavigableString objects ('\n', representing a new line).  The other two are the head and body tags

We want to get the <p> tag, which represents a paragraph.  The <p> tag is a child of the <body> tag, which is the 4th child of the <html> tag (index 3, because Python counts from 0)

In [18]:
# Index 3 is the <body> tag: the children are '\n', <head>, '\n', <body>, '\n'.
body = list(html.children)[3]

In [20]:
# List the direct children of the <body> tag.
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [21]:
# Index 1 is the <p> tag; it sits between two newline strings.
para = list(body.children)[1]

In [22]:
# Echo the extracted <p> tag.
para

<p>Here is some simple content for this page.</p>

get_text() method is used to get the text inside of a tag

In [23]:
# get_text() returns only the text inside the tag, without the <p> markup.
para.get_text()

'Here is some simple content for this page.'

# Finding all tags at once

In [25]:
# find_all searches the whole tree and returns every <p> tag in a list.
# (findAll is a legacy alias of the same method; find_all is the current name.)
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

find_all (findAll is its older alias) returns a list, so we need to loop through it or index into it to get a specific tag

In [26]:
# Index into the list returned by find_all to get the first <p> tag, then read its text.
# (find_all is the current name of the legacy findAll alias.)
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

Can also use .find() method to find 1st instance of tag.  This returns a single object

In [27]:
# find() returns the first matching tag itself rather than a list.
soup.find('p')

<p>Here is some simple content for this page.</p>

In [28]:
# Because find() returns a single tag, get_text() can be chained directly.
soup.find('p').get_text()

'Here is some simple content for this page.'

# Searching for Tags by Class and ID

In [33]:
# Download the second example page, whose tags carry class and id attributes.
# The timeout (in seconds) keeps this cell from hanging if the server never responds.
page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html", timeout=10)

In [34]:
# Echo the new Response object; its repr includes the HTTP status code.
page

<Response [200]>

In [35]:
# Confirm the download succeeded (200 means OK).
page.status_code

200

The page at this URL was downloaded successfully (status code 200)

In [36]:
# Parse the page just downloaded. The response is stored in `page`;
# no variable named `new_page` exists, so referencing it raised a NameError.
soup = BeautifulSoup(page.content, 'html.parser')

In [37]:
# Echo the parsed document; its repr is the HTML markup.
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [40]:
# Print the markup indented by nesting depth to make the class and id attributes easier to read.
print(soup.prettify())

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>


Now we will use find_all to find all p tags with the class 'outer-text'

In [46]:
# Restrict the search to <p> tags carrying the class 'outer-text'.
# class_ has a trailing underscore because `class` is a reserved word in Python.
soup.find_all("p", class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

Now we will use find_all without a tag name to find all tags (of any type) with the class 'outer-text'

In [45]:
# With no tag name given, any tag carrying the class 'outer-text' matches.
soup.find_all(class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [47]:
# Search by id attribute; the result is still a list.
soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

# CSS selector system for finding tags

p a — finds all a tags inside of a p tag. 
body p a — finds all a tags inside of a p tag inside of a body tag.
html body — finds all body tags inside of an html tag.
p.outer-text — finds all p tags with a class of outer-text.
p#first — finds all p tags with an id of first.
body p.outer-text — finds any p tags with a class of outer-text inside of a body tag.

In [51]:
# Print the document again for reference while trying the selectors below.
print(soup.prettify())

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>


In [48]:
# This page contains no <a> tags, so the selector matches nothing and returns an empty list.
soup.select("p a")

[]

In [49]:
# Matches every <p> anywhere under <body>, including the two nested inside the <div>.
soup.select("html body p")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>, <p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [50]:
# Adding .outer-text narrows the match to the two <p> tags with that class.
soup.select("html body p.outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]