# Web scraping intro

In [1]:
# import necessary libraries
import requests # pip install requests if not already installed
from bs4 import BeautifulSoup as bs # pip install beautifulsoup4 if not already installed (the PyPI package name is "beautifulsoup4")


In [2]:
# Load our first page

In [8]:
url = 'https://keithgalli.github.io/web-scraping/example.html'
# requests applies no timeout by default, so set one to keep the cell from
# hanging indefinitely if the server never answers.
r = requests.get(url, timeout=10)
# Raise on a 4xx/5xx reply instead of silently parsing an error page later on.
r.raise_for_status()

#what does r look like?
r, type(r)

(<Response [200]>, requests.models.Response)

In [14]:
# str() on the raw bytes returns their repr (note the b'...' wrapper and escaped newlines in the output)
# NOTE(review): r.text would presumably give the decoded HTML instead - confirm before swapping
str(r.content)

'b\'<html>\\n<head>\\n<title>HTML Example</title>\\n</head>\\n<body>\\n\\n<div align="middle">\\n<h1>HTML Webpage</h1>\\n<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>\\n</div>\\n\\n<h2>A Header</h2>\\n<p><i>Some italicized text</i></p>\\n\\n<h2>Another header</h2>\\n<p id="paragraph-id"><b>Some bold text</b></p>\\n\\n</body>\\n</html>\\n\''

In [20]:
# convert r to a beautiful soup object
# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed (and warns about guessing), so the resulting tree can
# differ between machines. 'html.parser' ships with Python.
soup = bs(r.content, 'html.parser')
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

In [21]:
# prettify() renders the tree as an indented string with one tag or text node per line
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [24]:
# start scraping!
# start with find and find_all
# find returns the first matching tag; find_all returns a list of every match

soup.find('h2'), soup.find_all('h2')

(<h2>A Header</h2>, [<h2>A Header</h2>, <h2>Another header</h2>])

In [27]:
# keep both lookups around under their own names
first_header = soup.find('h2')
all_headers = soup.find_all('h2')
(first_header, all_headers)

(<h2>A Header</h2>, [<h2>A Header</h2>, <h2>Another header</h2>])

In [30]:
# find accepts a list of tag names; the first tag in the document matching any of them is returned
first_header = soup.find(['h1', 'h2'])
first_header

<h1>HTML Webpage</h1>

In [32]:
# find_all with the same list returns every h1 and h2, in document order
headers = soup.find_all(['h1', 'h2'])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [34]:
# you can pass in attributes to the find/find_all functions.
# first, grab every paragraph with no attribute filter (the result is a list)
paragraph=soup.find_all("p")
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [35]:
# say we want to find the paragraph that has the attribute id='paragraph-id', use the parameter attrs={}
# to find this attribute with the paragraph tag.

soup.find_all("p", attrs={"id": "paragraph-id"} )

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [38]:
# you can nest find and find_all calls.
# start from the <body> tag; later lookups can be made on this object instead of the whole soup
body = soup.find('body')
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

Now let's say you want to find a 'div' tag; you can nest this call on the body object just created.

In [40]:
# search only within body for the first div
div = body.find('div')
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

Now let's do the same for the header (h1) inside that div.

In [42]:
# narrow once more: the first h1 inside that div
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [43]:
# we can search for specific strings in find_all calls.
# say we wanted to find paragraphs that contained the word 'some'
# (print the body first so the paragraph text is visible)

print(body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [49]:
# first attempt: pass the bare word as the string filter
some_paragraphs=body.find_all('p', string='Some')
some_paragraphs

[]

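Oh no, an empty list! A plain string passed to `string=` has to match the tag's whole text exactly, so `'Some'` on its own matches nothing.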

In [53]:
# it doesn't quite work...what if we put in 'Some bold text'??
# (that is the full text of the last paragraph, so an exact match is possible)
some_paragraphs=body.find_all('p', string='Some bold text')
some_paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [58]:
# this situation is not ideal.
# let's leverage our friend regex :)
# a compiled pattern only has to be found somewhere in the text, so a partial match is enough

# NOTE(review): consider moving this import up to the import cell at the top of the notebook
import re

some_paragraphs=body.find_all('p', string=re.compile('Some'))
some_paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [62]:
# another example: find all h2 headers that have the word "header" in them
# (the pattern is case-sensitive, so only the lowercase spelling matches)

headers=body.find_all('h2', string=re.compile('header'))
headers

[<h2>Another header</h2>]

Only one result comes back because the regex 'header' is case-sensitive and misses the capital "H" in "A Header". We can capture both spellings by changing the pattern: string=re.compile('(H|h)eader')

In [63]:
# (H|h) accepts either capitalisation of the first letter; '[Hh]eader' would be an equivalent pattern
headers=body.find_all('h2', string=re.compile('(H|h)eader'))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### select (CSS selector)

In [69]:
# print the body again for reference while writing selectors
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [64]:
# a bare tag name selects every element with that tag
soup.select('p')

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

Looks the same as find_all. Let's try finding all paragraph tags inside of a div.

In [71]:
# 'div p' = p elements nested inside a div (descendant selector)
soup.select('div p')

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [72]:
# select all the paragraphs that are preceded by an h2 sibling
# ('~' matches any later sibling of the h2, not only the element right after it)
soup.select('h2 ~ p')


[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [73]:
# let's do some more of this. It's useful to grab elements with specific id's

In [74]:
# 'p#paragraph-id b' = b elements inside the p whose id is "paragraph-id"
soup.select('p#paragraph-id b')

[<b>Some bold text</b>]

In [75]:
# 'body > p' = only p elements that are direct children of body
# (the paragraph nested inside the div is left out, as the output shows)
paragraphs=soup.select('body > p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [76]:
# we can loop through and make select calls with these objects that we just made. Let's take paragraphs for example
# first look at the type(paragraphs)

type(paragraphs)

list

In [78]:
# it's a list so we can loop. Let's inspect the type of an element in the list
type(paragraphs[0])

bs4.element.Tag

They are Beautiful Soup Tag objects, so we can call select and find_all on them. Let's do an example of finding the `i` (italic) elements inside each paragraph.

In [82]:
# For each tag in paragraphs, collect the <i> elements found inside it.
# Build the list directly with a comprehension; calling append inside a
# comprehension only for its side effect also creates a throwaway list of None.
paragraphs_i = [x.select('i') for x in paragraphs]
paragraphs_i

[[<i>Some italicized text</i>], []]

In [84]:
# Grab an element with a specific attribute value (here align="middle")
soup.select("[align='middle']")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

In [None]:
# Get different properties of the HTML