In [1]:
import re

import requests
from bs4 import BeautifulSoup as bs

In [8]:
# Download the example page. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error (404, 500, ...)
# into an exception instead of letting an error page be parsed as if it were data.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Name the parser explicitly: without it bs4 picks whichever parser happens to be
# installed (and emits GuessedAtParserWarning), so the parse tree can differ
# between machines. "html.parser" ships with Python and needs no extra install.
soup = bs(r.content, "html.parser")
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Web scraping using HTML

### using find and find_all

In [9]:
# find() gives back only the first matching tag ...
first_header = soup.find("h2")
# ... while find_all() gathers every match into a list-like result.
headers = soup.find_all("h2")

print(first_header)
print(headers)

<h2>A Header</h2>
[<h2>A Header</h2>, <h2>Another header</h2>]


In [11]:
# find() / find_all() also accept a list of tag names and match any of them.
heading_tags = ["h1", "h2"]

# First tag in the document that is either an <h1> or an <h2>
first_header = soup.find(heading_tags)
print(first_header)

# Every <h1> and <h2> in the document
headers = soup.find_all(heading_tags)
print(headers)

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [13]:
# Narrow a tag search by attribute values: attrs maps attribute name -> required value.
id_filter = {"id": "paragraph-id"}
paragraph = soup.find_all("p", attrs=id_filter)
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [16]:
# find() / find_all() can be called on any tag, not just on the soup itself,
# so searches can be nested to drill down one level at a time: body -> div -> h1.
# Only the last expression of a cell is echoed, so just the final result is shown.
body = soup.find("body")
div = body.find("div")
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [19]:
# The string= argument filters matches by their text content: here, <p> tags
# whose text is exactly the given string.
wanted_text = "Some bold text"
pr = soup.find_all("p", string=wanted_text)
pr

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [23]:
# string= also accepts a compiled regular expression instead of an exact string.
# (`re` is imported in the notebook's import cell at the top.)

# <p> tags whose text contains "Some". Kept under its own name so the result is
# not overwritten by the next search and can still be inspected afterwards.
paragraphs_with_some = soup.find_all("p", string=re.compile("Some"))

# <h2> tags whose text contains "Header" or "header"
pr = soup.find_all("h2", string=re.compile("(H|h)eader"))
pr

[<h2>A Header</h2>, <h2>Another header</h2>]

## Select (similar to CSS Selector)

In [26]:
# Pretty-print just the <body> subtree as a reference for the selector examples below.
print(soup.find("body").prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [30]:
# CSS selector reference: https://www.w3schools.com/cssref/css_selectors.asp

# A bare tag name selects every element of that type.
content = soup.select("p")

# General sibling combinator: every <p> that shares a parent with an <h2> and
# comes somewhere after it (not only the <p> immediately following the <h2>).
paragraphs = soup.select("h2 ~ p")

# Descendant combinator: <b> elements nested inside the element whose id is
# "paragraph-id". The longer form "p#paragraph-id b" additionally pins the tag
# name and selects the same element on this page.
bold_t = soup.select("#paragraph-id b")
bold_t

[<b>Some bold text</b>]

In [33]:
# select() can be called on individual tags too, so selections can be nested.

# Child combinator: only <p> elements that are direct children of <body>.
paragraphs = soup.select("body > p")
print(paragraphs)

# Look for <i> elements inside each selected paragraph; a paragraph without
# any yields an empty list.
for para in paragraphs:
    italics = para.select("i")
    print(italics)

# Attribute selector: elements whose align attribute equals "middle".
m = soup.select("[align=middle]")
m

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

# Getting different properties of HTML elements

In [40]:
header = soup.find("h2")
div = soup.find("div")

# .string works when a tag has exactly one child, as this <h2> does.
print(header.string)

# The <div> has several children at the same level, so .string cannot tell
# which one to return and evaluates to None.
print(div.string)

# get_text() handles the multi-child case by joining the text of all descendants.
print(div.get_text())

None

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



## For more examples, use https://keithgalli.github.io/web-scraping/webpage.html

In [55]:
# Reading a specific attribute (href, src, id, class, ...) from a tag.
# Attributes are not child tags, so link.find("href") and link.select("a> href")
# cannot be used for this; index the tag like a dictionary instead.
link = soup.find("a")
link["href"]  # evaluates to the URL, but is not echoed: it is not the cell's last expression

paragraphs = soup.select("p#paragraph-id")
print(paragraphs)

# The same dictionary-style lookup works for any attribute, e.g. id.
first_paragraph = paragraphs[0]
print(first_paragraph["id"])

[<p id="paragraph-id"><b>Some bold text</b></p>]
paragraph-id


## Code Navigation

In [60]:
# Path syntax: chaining tag names as attributes walks down the tree, taking the
# first tag of each name at every step (body -> div -> h1), then reads its text.
soup.body.div.h1.string

'HTML Webpage'

## Parent, Children and Siblings
## https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [63]:
# Collect every sibling element that comes after the first <div> inside <body>.
soup.body.div.find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]