In [26]:
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

In [3]:
# Request the sample page; `html` holds the response object returned by urlopen.
page1_url = 'http://pythonscraping.com/pages/page1.html'
html = urlopen(page1_url)

In [4]:
# Read the whole response body (raw bytes) and display it.
page_bytes = html.read()
print(page_bytes)

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


# BeautifulSoup

In [5]:
from bs4 import BeautifulSoup

In [17]:
# Download the page and parse it. The response is used as a context manager so
# the connection is closed as soon as the body has been read.
with urlopen('http://pythonscraping.com/exercises/exercise1.html') as html:
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed, which emits a warning and can differ between machines.
    bs_obj = BeautifulSoup(html.read(), 'html.parser')
print(bs_obj.h1)

<h1>An Interesting Title</h1>


Not every URL can be downloaded successfully, so we should be prepared to handle the error when a request fails.

In [27]:
# Try to download a URL that does not exist on the server.
# Reset `html` first so a failed download cannot leave a stale response from an
# earlier cell behind; after a failure `html` is None.
html = None
try:
    html = urlopen('http://pythonscraping.com/exercises/exercise9.html')
except HTTPError as e:
    # The server answered, but with an error status (e.g. 404 or 500).
    print(e)  # Return None, break, or handle the failure some other way.
except URLError as e:
    # The server could not be reached at all (bad domain, no connection, ...).
    # Must come after HTTPError, which is a subclass of URLError.
    print(e)
# An 'else:' branch could hold the code that continues after a successful
# download, but it is unnecessary if the except branches return or break.

HTTP Error 404: Not Found


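If the server cannot be reached at all (for example, the domain does not exist), `urlopen` does not return `None`; it raises a `URLError`, so that case needs its own handling. It is still worth confirming that we actually hold a response object before using it.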

In [19]:
# Make sure a response object is actually available before going further.
no_response = html is None
if no_response:
    print('URL is not found')

In some cases, it is a good idea to check whether the tag you want to access really exists before doing anything further. If you look up a non-existent tag, BeautifulSoup returns `None`, and accessing an attribute on that `None` object raises an `AttributeError`.

In [21]:
# Look up a tag that is not in the document and show what find() gives back.
missing_tag = bs_obj.find('nonExistentTag')
print(missing_tag)

None


In [24]:
# Reading an attribute from the None that find() returned raises AttributeError.
missing_tag = bs_obj.find('nonExistentTag')
missing_tag.someTag

AttributeError: 'NoneType' object has no attribute 'someTag'

So now we will prevent these errors.

In [25]:
# Two things can go wrong when chaining tag lookups:
#   1. the outer tag is missing, so find() returns None and the attribute
#      access on it raises AttributeError;
#   2. the outer tag exists but the inner tag does not, which yields None.
try:
    bad_content = bs_obj.find('nonExistentTag').anotherTag
except AttributeError:
    print('Tag was not found')
else:
    # Compare with `is`: None is a singleton, and `==` can be overridden.
    if bad_content is None:
        print('Tag was not found')
    else:
        print(bad_content)

Tag was not found


Now let's wrap the steps above in a reusable function.

In [28]:
def get_title(url):
    """Download a page and return its first <h1> inside <body>.

    Args:
        url: Address of the page to fetch.

    Returns:
        The BeautifulSoup tag for ``body.h1``, or None when the page cannot be
        downloaded (HTTP error status, unreachable server, unsupported scheme)
        or when the document has no such tag.
    """
    try:
        # The context manager closes the connection once the body is read.
        with urlopen(url) as response:
            page_bytes = response.read()
    except (HTTPError, URLError):
        # HTTPError: the server replied with an error status.
        # URLError: the server could not be reached at all.
        return None

    try:
        # An explicit parser keeps results consistent across installations.
        bs_obj = BeautifulSoup(page_bytes, 'html.parser')
        title = bs_obj.body.h1
    except AttributeError:
        # bs_obj.body was None, i.e. the document has no <body>.
        return None

    return title

In [30]:
# get_title returns None on failure; test for it with `is`, since None is a
# singleton and `==` can be overridden by the compared object.
title = get_title("http://www.pythonscraping.com/exercises/exercise1.html")
if title is None:
    print('Title could not be found')
else:
    print(title)

<h1>An Interesting Title</h1>
