In [1]:
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

In [13]:
# Open two sample pages. urlopen() hands back a response object for each;
# the body of my_html is read and printed in the next cell.
html = urlopen('http://pythonscraping.com/pages/page1.html')
my_html = urlopen('https://pasinsiri.github.io/WebScraping/.')

In [14]:
# Print the raw body of the second page; read() returns bytes, hence the
# b'...' literal in the output. (Use html.read() to inspect the first page.)
print(my_html.read())

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>This is the title</title>\n    </head>\n\n    <body>\n        <div id="header" class="my-class">\n            <h1 class="my-class">This is the h1 heading within the div tag</h1>\n            <h2 class="my-class">This is the h2 heading within the div tag</h2>\n        </div>\n        <h1>This is the h1 heading <br> and before this is a break tag</h1>\n        <h1>Welcome to the <span class="red">tutorial</span></h1>\n        <h2>This is the h2 heading</h2>\n        <h3>This is the h3 heading</h3>\n        <hr>\n        <p>This is a paragraph, with a horizontal line tag above it</p>\n        <p>This is another paragraph</p>\n        <img src="sample_image.jpg" alt="this is the image description" width="200px" />\n        <a href="http://google.com"> This is a hyperlink tag, click here to go to Google</a>\n        <ul>\n            <li>Bullet 1</li>\n            <li>Bullet 2</li>\n        </ul>\n        <ol>\n            <li>Number 1</

# BeautifulSoup

In [6]:
from bs4 import BeautifulSoup

In [12]:
# Download the exercise page and parse it. Naming the parser explicitly keeps
# the result independent of which optional parsers happen to be installed and
# avoids BeautifulSoup's "no parser was explicitly specified" warning.
# The with-block closes the HTTP response once its body has been read.
with urlopen('http://pythonscraping.com/exercises/exercise1.html') as html:
    bs_obj = BeautifulSoup(html.read(), 'html.parser')
print(bs_obj.h1)

<h1>An Interesting Title</h1>


Not every URL can be downloaded successfully, so we should be prepared to handle the error case.

In [6]:
# Request a page that does not exist and handle the failure explicitly.
try:
    html = urlopen('http://pythonscraping.com/exercises/exercise9.html')
except URLError as e:
    # URLError covers both an unreachable server and, through its subclass
    # HTTPError, an error status returned by the server (such as 404).
    print(e)
    # Reset html so a response left over from an earlier cell is not
    # mistaken for the result of this request.
    html = None
# No 'else' branch is required: if the handler returns or breaks, the code
# after the try statement only runs on success anyway.

HTTP Error 404: Not Found


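If the server cannot be reached at all (for example, the domain does not exist), `urlopen` does not return `None`; it raises a `URLError`, which is the parent class of `HTTPError`. A `None` check like the one below is therefore only meaningful when the error handler explicitly sets `html` to `None`.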

In [7]:
# This branch fires only if earlier code assigned None to html
# (for example inside an exception handler).
if html is None:
    print('URL is not found')

In some cases, it is good to check whether the tag you want to access really exists before doing anything further. If you try to find a non-existent tag, BeautifulSoup returns a `None` object, and accessing an attribute on that `None` object raises an `AttributeError`.

In [8]:
# find() gives back None when the document contains no matching tag.
print(bs_obj.find('nonExistentTag'))

None


In [9]:
# Accessing an attribute on the None returned by find() raises AttributeError.
# Catch and display it so the demonstration does not abort a full
# "Restart & Run All" of the notebook.
try:
    bs_obj.find('nonExistentTag').someTag
except AttributeError as err:
    print('{}: {}'.format(type(err).__name__, err))

AttributeError: 'NoneType' object has no attribute 'someTag'

So now we will prevent these errors.

In [10]:
# Look up a nested tag defensively. Two things can go wrong:
#   1. the outer tag is missing, so find() returns None and the attribute
#      access raises AttributeError;
#   2. the outer tag exists but the inner tag does not, which yields None.
# Both cases are folded into bad_content being None.
try:
    bad_content = bs_obj.find('nonExistentTag').anotherTag
except AttributeError:
    bad_content = None

# Identity comparison is the idiomatic (and safest) way to test for None.
if bad_content is None:
    print('Tag was not found')
else:
    print(bad_content)

Tag was not found


Now let's wrap the steps above in a reusable function.

In [11]:
def get_title(url):
    """Return the first <h1> tag inside the <body> of the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The BeautifulSoup tag for the first ``<h1>`` in the page body, or
        ``None`` when the page cannot be downloaded or has no such tag.
    """
    try:
        # The with-block closes the response once the body has been read.
        with urlopen(url) as response:
            markup = response.read()
    except URLError:
        # Covers an unreachable server or missing resource as well as HTTP
        # error statuses: HTTPError is a subclass of URLError.
        return None

    try:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers are installed.
        return BeautifulSoup(markup, 'html.parser').body.h1
    except AttributeError:
        # The document has no <body>, so .body was None.
        return None

In [12]:
# Fetch the heading of the exercise page; get_title() signals failure by
# returning None, which is tested by identity rather than equality.
title = get_title("http://www.pythonscraping.com/exercises/exercise1.html")
if title is None:
    print('Title could not be found')
else:
    print(title)

<h1>An Interesting Title</h1>
