# CodeLabs [Introduction to BeautifulSoup 4 (bs4)](https://eueung.github.io/python/bs4)

# Example #1

In [None]:
# Build the sample document, parse it, and print an indented rendering of it.
from bs4 import BeautifulSoup

# Sample markup for this cell. The snippet ends after the last <p>, so the
# closing </body> and </html> tags are not present in the string.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>"""

# `soup` is reused by the following cells; 'html.parser' is the parser that
# ships with the Python standard library.
soup = BeautifulSoup(html_doc, 'html.parser')

# prettify() returns the parse tree as a string, one tag/string per line.
pretty = soup.prettify()
print(pretty)

# Example #2

In [None]:
# Navigate the parsed document by tag name. Uses the `soup` object built in
# Example #1; each trailing comment shows what the call prints.
print(soup.title)             # <title>The Dormouse's story</title>
print(soup.title.name)        # title
print(soup.title.string)      # The Dormouse's story
print(soup.title.parent.name) # head

# Attribute access by tag name returns the FIRST matching tag in the document.
print(soup.p)                 # <p class="title"><b>The Dormouse's story</b></p>
print(soup.p['class'])        # ['title']  (class is multi-valued, so a list)
print(soup.a)                 # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# find() returns a single match, find_all() returns every match.
print(soup.find(id="link3"))  # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
print(soup.find_all('a'))     # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
                              #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
                              #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [None]:
# Print the target URL of every <a> tag in the document, one per line.
for anchor in soup.find_all('a'):
    # .get() yields None instead of raising when an attribute is absent.
    href = anchor.get('href')
    print(href)

In [None]:
# Print only the text of the document, with all markup stripped.
page_text = soup.get_text()
print(page_text)

# Example #3

In [None]:
# Walk down the tree from <head>: .contents / .children give direct children,
# .descendants recurses, and .string gives the single string inside a tag.
# Uses the `soup` object built in Example #1; trailing comments show the output.
head_tag = soup.head
print(head_tag)           # <head><title>The Dormouse's story</title></head>
print(head_tag.contents)  # [<title>The Dormouse's story</title>]
print(head_tag.string)    # The Dormouse's story

for child in head_tag.descendants:  # <title>The Dormouse's story</title>
    print(child)                    # The Dormouse's story
# -------
title_tag = head_tag.contents[0]
print(title_tag)          # <title>The Dormouse's story</title>
print(title_tag.contents) # ["The Dormouse's story"]
print(title_tag.string)   # The Dormouse's story

# A string is a leaf of the tree: it has no children of its own.
text = title_tag.contents[0]
# print(text.contents)    # AttributeError: 'NavigableString' object has no attribute 'contents'

for child in title_tag.children:
    print(child)  # The Dormouse's story

# Example #4

In [None]:
# Inspect the top of the tree and iterate over the document's text nodes.
# Uses the `soup` object built in Example #1; trailing comments show the output.
# print(soup.contents)
print(len(soup.contents))          # 1
print(soup.contents[0].name)       # html

print(len(list(soup.children)))    # 1
print(len(list(soup.descendants))) # 25
# --------
# .string is None when a tag contains more than one child.
print(soup.html.string) # None

# .strings yields every text node, including whitespace-only ones:
# for string in soup.strings:
#     print(repr(string))

# .stripped_strings yields the same text nodes with surrounding whitespace
# removed and whitespace-only strings skipped.
for string in soup.stripped_strings:
    print(repr(string))

# Example #5

In [None]:
# Tag objects: read and modify a tag's name and attributes.
# NOTE: this rebinds `soup`, replacing the document parsed in Example #1.
# The parser is named explicitly (as in Example #1) so bs4 does not have to
# guess one and emit its "no parser was explicitly specified" warning.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
print(type(tag))        # <class 'bs4.element.Tag'>
print(tag.name)         # b

# Renaming a tag is reflected in the generated markup.
tag.name = "blockquote"
print(tag)              # <blockquote class="boldest">Extremely bold</blockquote>
print(tag['class'])     # ['boldest']  (class is multi-valued, so a list)
print(tag.attrs)        # {'class': ['boldest']}

# Attributes behave like dictionary entries: assign to add or replace them.
tag['class'] = 'verybold'
tag['id'] = 1
print(tag)              # <blockquote class="verybold" id="1">Extremely bold</blockquote>
# -------------
del tag['class']
del tag['id']
print(tag)              # <blockquote>Extremely bold</blockquote>

# Indexing a missing attribute raises; .get() returns None instead.
# print(tag['class'])   # KeyError: 'class'
print(tag.get('class')) # None

# Example #6

In [None]:
# Multi-valued attributes: with an HTML parser, attributes such as `class`
# and `rel` come back as lists, while single-valued ones such as `id` stay
# plain strings. The HTML parser is named explicitly (as in Example #1) so
# bs4 does not have to guess one. Trailing comments show the printed output.
class_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
print(class_soup.p['class'])  # ['body', 'strikeout']

class_soup = BeautifulSoup('<p class="body"></p>', 'html.parser')
print(class_soup.p['class'])  # ['body']

# `id` is not multi-valued, so the embedded space does not split the value.
id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
print(id_soup.p['id'])      # my id

# ----------

rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'html.parser')
print(rel_soup.a['rel'])    # ['index']

# A list assigned to a multi-valued attribute is joined with spaces on output.
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)          # <p>Back to the <a rel="index contents">homepage</a></p>

# ----------

# Parsed as XML there are no multi-valued attributes, so `class` is a string.
# NOTE(review): the 'xml' feature needs the lxml package -- confirm it is installed.
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
print(xml_soup.p['class'])  # body strikeout

## References
- [Beautiful Soup Documentation — Beautiful Soup 4.4.0 documentation](http://www.crummy.com/software/BeautifulSoup/bs4/doc/)

## Other Readings
- [Web Scraping with Beautiful Soup](http://web.stanford.edu/~zlotnick/TextAsData/Web_Scraping_with_Beautiful_Soup.html)
- [Really short intro to scraping with Beautiful Soup and Requests](https://gist.github.com/bradmontgomery/1872970)