# Beautiful Soup
## Tags

In [8]:
import requests
from bs4 import BeautifulSoup


response = requests.get("https://stackoverflow.com/questions")
response.status_code

200

In [9]:
# extract the text directly from soup
soup = BeautifulSoup(response.text, "html.parser")  

## lists
- ul unordered element
- ol ordered element
- li list item

In [1]:
# soup.find('ul')

## tables
- table is the element
- tr is table row element
- td is table data element contained inside the row - child of tr

In [51]:
print(soup.find('table'))

None


## find paragraphs
- use p tags
- use  select function

In [52]:
soup.select('p')

[<p class="mb0 lh-lg">
                     By using our site, you acknowledge that you have read and understand our <a class="s-link s-link__inherit td-underline fc-white" href="https://stackoverflow.com/legal/cookie-policy" target="_blank">Cookie Policy</a>, <a class="s-link s-link__inherit td-underline fc-white" href="https://stackoverflow.com/legal/privacy-policy" target="_blank">Privacy Policy</a>, and our <a class="s-link s-link__inherit td-underline fc-white" href="https://stackoverflow.com/legal/terms-of-service/public" target="_blank">Terms of Service</a>.
                 </p>,
 <p class="mb2"><strong>Teams</strong></p>,
 <p class="mb16 fs-caption fc-medium">Q&amp;A for Work</p>,
 <p class="mb8 fs-caption fc-medium">
 
                             Stack Overflow for Teams is a private, secure spot for you and
                             your coworkers to find and share information.
                                         </p>,
 <p class="mt-auto mb24">
 site design / logo ©

# select divisions
- use tags
- use function select

In [2]:
# soup.select('div')

## hyperlink 
- 'a' tag

In [54]:
# select a tag within a tag
# select a within p
soup.select('p a')

[<a class="s-link s-link__inherit td-underline fc-white" href="https://stackoverflow.com/legal/cookie-policy" target="_blank">Cookie Policy</a>,
 <a class="s-link s-link__inherit td-underline fc-white" href="https://stackoverflow.com/legal/privacy-policy" target="_blank">Privacy Policy</a>,
 <a class="s-link s-link__inherit td-underline fc-white" href="https://stackoverflow.com/legal/terms-of-service/public" target="_blank">Terms of Service</a>,
 <a href="https://stackoverflow.com/help/licensing">cc by-sa</a>]

In [3]:
# # select multiple tags
# # finds a and p
soup.select('p, a')
pass

## Select Classes
- the dot means it's a class
- we get an iterable and get the first item to see type
- each object is an instance of the class

In [56]:
questions = soup.select(".question-summary")  
# print(questions[0])
print(type(questions[0]))
print(questions[0].attr)

<class 'bs4.element.Tag'>
None


## select classes with space

In [57]:
url = 'http://www.marketwatch.com/'
user_headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers = user_headers)
soup = BeautifulSoup(response.content, 'html.parser')

# use . before the class name
soup.select('.region region--full')

[]

In [59]:
## if you have a class with a space
# replace all spaces with dots
# the above changes to
soup.select('.region.region--full')
pass

In [10]:
# select class with tag
# preceed the . wth the tag
soup.select('div.region.region--full')
pass

In [11]:
# find and find_all
soup.find_all('div')
pass

In [12]:
# no . neded in spaces inside ""
soup.find_all('div', class_ = 'region region--full')
pass

In [13]:
# find a division with specific class
# no . neded in spaces
soup.find_all('div', class_ = 'region region--full')
pass

## images

In [67]:
# image
# images are usually in img src
# this example is i data-srcset and it's a set
soup.find('img')

<img alt="Read full story" class="lazyload" data-sizes="auto" data-srcset="https://s.marketwatch.com/public/resources/images/MW-DD868_casino_ZR_20150122054110.jpg 220w,https://s.marketwatch.com/public/resources/images/MW-DD868_casino_ZT_20150122054110.jpg 300w,https://s.marketwatch.com/public/resources/images/MW-DD868_casino_ZQ_20150122054110.jpg 460w,https://s.marketwatch.com/public/resources/images/MW-DD868_casino_ZH_20150122054110.jpg 800w,https://s.marketwatch.com/public/resources/images/MW-DD868_casino_ZG_20150122054110.jpg 1240w"/>

In [68]:
# search for soups's instances - use get
soup.find('img').get('data-srcset')

'https://s.marketwatch.com/public/resources/images/MW-DD868_casino_ZR_20150122054110.jpg 220w,https://s.marketwatch.com/public/resources/images/MW-DD868_casino_ZT_20150122054110.jpg 300w,https://s.marketwatch.com/public/resources/images/MW-DD868_casino_ZQ_20150122054110.jpg 460w,https://s.marketwatch.com/public/resources/images/MW-DD868_casino_ZH_20150122054110.jpg 800w,https://s.marketwatch.com/public/resources/images/MW-DD868_casino_ZG_20150122054110.jpg 1240w'

## find by id - CHECK 

In [36]:
# find division by id
# <div class="section" id="advanced-usage">
# use select and separate class and if with #
# to separate id use #
soup.select('div#notify-container')

[]