# Creating a BeautifulSoup object

In [3]:
from bs4 import BeautifulSoup

In [7]:
# Creating a BeautifulSoup object from a string

helloworld = "<p>Hello World</p>"
soup_string = BeautifulSoup(helloworld)
print(soup_string)

<html><body><p>Hello World</p></body></html>


In [8]:
# Creating a BeautifulSoup object from a file-like object

import urllib.request

url = "https://www.ted.com/"
page = urllib.request.urlopen(url)
soup_package = BeautifulSoup(page)
print(soup_package)

with open("foo.html","r") as foo_file:
    soup_foo = BeautifulSoup(foo_file)
print(soup_foo)

<!DOCTYPE html>
<!--[if lt IE 8]> <html class="no-js loggedout oldie ie7" lang="en"> <![endif]--><!--[if IE 8]> <html class="no-js loggedout oldie ie8" lang="en"> <![endif]--><!--[if gt IE 8]><!--><html class="no-js loggedout" lang="en"><!--<![endif]-->
<head>
<script>
  (function (H){
  H.className=H.className.replace(/\bno-js\b/,'js');
  if (('; '+document.cookie).match(/; _ted_user_id=/)) H.className=H.className.replace(/\bloggedout\b/,'loggedin');
  })(document.documentElement)
</script><meta charset="utf-8"/>
<title>TED: Ideas worth spreading</title>
<meta content="TED Talks are influential videos from expert speakers on education, business, science, tech and creativity, with subtitles in 100+ languages. Ideas free to stream and download." name="description"/>
<meta content="https://pi.tedcdn.com/r/pl.tedcdn.com/social/ted-logo-fb.png?v=wAff13s?" property="og:image"/>
<meta content="F1F8CFF85FF82D607911A8BAB418B939" name="msvalidate.01"/>
<meta content="71afe64571a4f2fbc27fbbad7ff

# Creating a BeautifulSoup object for XML parsing

In [10]:
soup_xml = BeautifulSoup(helloworld, features= "xml")
print(soup_xml)

<?xml version="1.0" encoding="utf-8"?>
<p>Hello World</p>


In [12]:
invalid_html = '<a invalid content'

soup_invalid_html_lxml = BeautifulSoup(invalid_html, 'lxml')
print(soup_invalid_html_lxml)

soup_invalid_html_html5lib = BeautifulSoup(invalid_html, 'html5lib')
print(soup_invalid_html_html5lib)

soup_invalid_html_htmlparser = BeautifulSoup(invalid_html, 'html.parser')
print(soup_invalid_html_htmlparser)


<html><body><a content="" invalid=""></a></body></html>
<html><head></head><body></body></html>
&lt;a invalid content


# Tag
## Accessing the Tag object from BeautifulSoup

In [46]:
html_atag = """"<html><body><p>Test html a tag example</p>
<a href="http://www.packtpub.com">Home</a>
<a href="http://www.packtpub.com/books">Books</a>
</body>
</html>"""
soup_atag = BeautifulSoup(html_atag,'lxml')
atag = soup_atag.a
print(atag)

<a href="http://www.packtpub.com">Home</a>


## Name of the Tag object

In [47]:
tagname = atag.name
print (tagname)

a


In [40]:
atag.name = 'p'
print(soup)

<html><body><p>"</p><p>Test html a tag example</p>
<p href="http://www.packtpub.com">Home</p>
<p href="http://www.packtpub.com/books">Books</p>
</body></html>


## Attributes of a Tag object

In [48]:
atag = soup_atag.a
print (atag['href'] )

print(atag.attrs)

http://www.packtpub.com
{'href': 'http://www.packtpub.com'}


## The NavigableString object

In [49]:
first_a_string = soup_atag.a.string
print(first_a_string)

Home


# Summary
In this chapter, we learned the different objects in the Beautiful Soup module. We
understood how the HTML/XML document is converted to a BeautifulSoup
object with the help of underlying TreeBuilders. We also had a look at the creation
of BeautifulSoup by passing a string and a file object (for a local file and URL).
Creating BeautifulSoup for XML parsing and the use of the features argument in
the constructor were also explained. We saw how the different tags and texts within
the HTML/XML document are represented as a Tag and NavigableString object in
Beautiful Soup.
In the next chapter, we will learn the different searching methods, such as find(),
find_all(), and find_next(), provided by Beautiful Soup. With the help of these
searching methods, we will be able to get data out of the HTML/XML document,
which is indeed the most powerful feature of Beautiful Soup.