#### Load Necessary Libraries

In [2]:
# We import requests so we load the web pages
import requests
from bs4 import BeautifulSoup as bs
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/ WE can read the beautiful soup doc for more information

##### Load the webpage content

In [3]:
r = requests.get('https://keithgalli.github.io/web-scraping/example.html')

##### Convert our web page to a beautiful page object

In [4]:
soup = bs(r.content)

# We can use prettify to enhance the layout of output
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



#### Start Using Beautiful Soup to Scrape

##### find and find_all

In [43]:
# The find command finds the first element that matches irs given string arg
first_header = soup.find('h2')
# We can also do a simple soup.h2 and ti works like the find function and we can next even more like soup.body.div.p

# The find_all returns a list of all elements that match the given arg
headers = soup.find_all('h2')
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [10]:
# We can pass in a list of element to look for
# The order in the list does not matter, whatever we put in the list, it finds the first occurence of any of the item that comes first
hirst_header = soup.find(['h1', 'h2'])

headers = soup.find_all(['h1', 'h2'])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [15]:
# We can find find an element by its attribute
paragraph = soup.find_all('p', attrs={'id': 'paragraph-id'})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [21]:
# We can next find and find_all calls
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [35]:
# We can search for a specific string in our find/findall calls
# With thois, we have to pass in the exact string i.e the exact complete textContent of the tag
string_search = soup.find_all('p', string='Some bold text')

# With this we can pass in even a single string and it finds it using regex and it is still case sensitive
import re
paragraph = soup.find_all('p', string=re.compile('Some'))

# Findh case insensitive
headerz = soup.find_all('h2', string=re.compile('(H|h)eader'))
headerz

[<h2>A Header</h2>, <h2>Another header</h2>]

##### Select (CSS Selector)

In [48]:
# The select function returns a list
# content = soup.select('p')

#  We can next, i.e if we are looking for a p tag in div
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [45]:
# Paragraphs directly after h2, i.e of the same parent, next sibling
paragraph_s = soup.select('h2 ~ p')
paragraph_s

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [50]:
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [63]:
# We can make nexted calls
# body > p, direct descendant of the body tag
paragra_ph = soup.select('body > p')
print(paragra_ph[1].b)

for p in paragra_ph:
    print(p.select('i'))

<b>Some bold text</b>
[<i>Some italicized text</i>]
[]


In [64]:
# Grab an element by its attribute
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

#### Getting Different Properties of the HTML

In [71]:
# Getting the string within an element
# textContent of a particular element without its children involved
h = soup.find('h2')
h.string
d = soup.find('div')
print(d.string)

# For multipe child element
# To get all the textContent inside a container, be it in the element itself or its children element, we use:
print(d.get_text())


None

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [81]:
# Get a specific property from an element
link = soup.find('a')
print(link['href'])

par = soup.select('#paragraph-id')
par[-1]['id']

https://keithgalli.github.io/web-scraping/webpage.html


'paragraph-id'

In [82]:
# Path syntax
soup.body.div.h1.string

'HTML Webpage'

In [87]:
# Know the terms parent, sibling and child
# find_next_slblings() returns every sibling aftet the element it is called on
# We can check the doc for more of these functions
soup.body.find('div').find_next_siblings() 

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]